/*
 * Copyright 2003-2016 MarkLogic Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.marklogic.contentpump;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Random;

import com.marklogic.contentpump.utilities.CommandlineOption;
import com.marklogic.contentpump.utilities.AuditUtil;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import com.marklogic.mapreduce.ContentType;
import com.marklogic.mapreduce.DocumentURI;
import com.marklogic.mapreduce.ForestInputFormat;
import com.marklogic.mapreduce.Indentation;
import com.marklogic.mapreduce.MarkLogicConstants;
import com.marklogic.mapreduce.MarkLogicDocument;
import com.marklogic.mapreduce.DatabaseDocument;
import com.marklogic.mapreduce.ForestDocument;
import com.marklogic.mapreduce.utilities.InternalUtilities;
import com.marklogic.xcc.ContentSource;
import com.marklogic.xcc.Session;

/**
 * Enum of supported commands.
 *
 * @author jchen
 */
@SuppressWarnings("static-access")
public enum Command implements ConfigConstants {
    IMPORT {
        @Override
        public void configOptions(Options options) {
            configCommonOptions(options);
            configConnectionId(options);
            configCopyOptions(options);
            configCommonOutputOptions(options);
            configBatchTxn(options);
            configModule(options);
            configRDFGraphOutputOptions(options);
            Option inputFilePath = OptionBuilder.withArgName("path").hasArg()
                .withDescription("The file system location for input, as a "
                    + "regular expression")
                .create(INPUT_FILE_PATH);
            inputFilePath.setRequired(true);
            options.addOption(inputFilePath);
            Option inputFilePattern = OptionBuilder.withArgName("regex pattern").hasArg()
                .withDescription("Matching regex pattern for files found in "
                    + "the input file path")
                .create(INPUT_FILE_PATTERN);
            options.addOption(inputFilePattern);
            Option aggregateRecordElement = OptionBuilder.withArgName("QName").hasArg()
                .withDescription("Element name in which each document is found")
                .create(AGGREGATE_RECORD_ELEMENT);
            options.addOption(aggregateRecordElement);
            Option aggregateRecordNamespace = OptionBuilder.withArgName("namespace").hasArg()
                .withDescription("Element namespace in which each document is found")
                .create(AGGREGATE_RECORD_NAMESPACE);
            options.addOption(aggregateRecordNamespace);
            Option aggregateUriId = OptionBuilder.withArgName("QName").hasArg()
                .withDescription("Deprecated. Name of the first element or attribute "
                    + "within a record element to be used as document URI."
+ " If omitted, a sequence id will be generated to " + " form the document URI.") .create(AGGREGATE_URI_ID); options.addOption(aggregateUriId); Option inputFileType = OptionBuilder.withArgName("type").hasArg() .withDescription("Type of input file. Valid choices are: " + "aggregates, archive, delimited_text, documents, forest," + "rdf, sequencefile, delimited_json") .create(INPUT_FILE_TYPE); options.addOption(inputFileType); Option inputCompressed = OptionBuilder.withArgName("true,false").hasOptionalArg() .withDescription("Whether the input data is compressed").create(INPUT_COMPRESSED); options.addOption(inputCompressed); Option inputCompressionCodec = OptionBuilder.withArgName("codec").hasArg() .withDescription("Codec used for compression: ZIP, GZIP").create(INPUT_COMPRESSION_CODEC); options.addOption(inputCompressionCodec); Option documentType = OptionBuilder.withArgName("type").hasArg() .withDescription("Type of document content. Valid choices: " + "XML, JSON, TEXT, BINARY, and MIXED. Default type for " + "document is MIXED, where the type is determined " + "from the MIME type mapping configured in MarkLogic " + "Server.") .create(DOCUMENT_TYPE); options.addOption(documentType); Option delimiter = OptionBuilder.withArgName(DELIMITER).hasArg() .withDescription("Delimiter for delimited text.").create(DELIMITER); options.addOption(delimiter); Option delimitedUri = OptionBuilder.withArgName("column name").hasArg() .withDescription("Deprecated. Delimited uri id for delimited text.").create(DELIMITED_URI_ID); options.addOption(delimitedUri); Option delimitedRoot = OptionBuilder.withArgName("root name").hasArg().withDescription( "Root element local name of the XML " + "document constructed from one delimited text record.") .create(DELIMITED_ROOT_NAME); options.addOption(delimitedRoot); Option generateUri = OptionBuilder.withArgName("true, false").hasOptionalArg() .withDescription("Enables automatic URI generation for " + "delimited text records.") .create(GENERATE_URI); options.addOption(generateUri); Option namespace = OptionBuilder.withArgName(NAMESPACE).hasArg() .withDescription("Namespace used for output document.").create(NAMESPACE); options.addOption(namespace); Option outputLanguage = OptionBuilder.withArgName("language").hasArg() .withDescription("Language name to associate with output " + "documents. A value of \"en\" indicates that the " + "documents are in english. 
The default is null, " + "which indicates the server default.") .create(OUTPUT_LANGUAGE); options.addOption(outputLanguage); Option outputCleanDir = OptionBuilder.withArgName("true,false").hasOptionalArg() .withDescription("Whether to clean dir before output.").create(OUTPUT_CLEANDIR); options.addOption(outputCleanDir); Option outputDir = OptionBuilder.withArgName("directory").hasArg() .withDescription("Output directory in MarkLogic.").create(OUTPUT_DIRECTORY); options.addOption(outputDir); Option outputFilenameCollection = OptionBuilder.withArgName("true,false").hasOptionalArg() .withDescription("Filename as collection in output.").create(OUTPUT_FILENAME_AS_COLLECTION); options.addOption(outputFilenameCollection); Option repairLevel = OptionBuilder.withArgName("level").hasArg() .withDescription("Whether to repair documents to make it well " + "formed or throw error.") .create(XML_REPAIR_LEVEL); options.addOption(repairLevel); Option seqKeyClass = OptionBuilder.withArgName("class name").hasArg() .withDescription("Name of class to be used as key to read the " + " input SequenceFile") .create(INPUT_SEQUENCEFILE_KEY_CLASS); options.addOption(seqKeyClass); Option seqValueClass = OptionBuilder.withArgName("class name").hasArg() .withDescription("Name of class to be used as value to read " + "the input SequenceFile") .create(INPUT_SEQUENCEFILE_VALUE_CLASS); options.addOption(seqValueClass); Option seqValueType = OptionBuilder.withArgName("value type").hasArg() .withDescription("Type of the value data returned by the above" + " class. Valid choices are: Text, BytesWritable, " + "MarkLogicDocument and Path.") .create(INPUT_SEQUENCEFILE_VALUE_TYPE); options.addOption(seqValueType); Option allowEmptyMeta = OptionBuilder.withArgName("true,false").hasOptionalArg() .withDescription("Whether to allow empty metadata when " + "importing archive") .create(ARCHIVE_METADATA_OPTIONAL); options.addOption(allowEmptyMeta); Option fastLoad = OptionBuilder.withArgName("true,false").hasOptionalArg() .withDescription("Whether to use the fast load mode to load " + "content into MarkLogic") .create(FAST_LOAD); options.addOption(fastLoad); Option streaming = OptionBuilder.withArgName("true,false").hasOptionalArg() .withDescription("Whether to use streaming to output data to" + " MarkLogic").create(STREAMING); options.addOption(streaming); Option encoding = OptionBuilder.withArgName("encoding").hasOptionalArg() .withDescription("The charset encoding to be used by the MarkLogic when " + "loading documents. The default is \"UTF-8\".") .create(CONTENT_ENCODING); options.addOption(encoding); Option uriId = OptionBuilder.withArgName("uri name").hasArg() .withDescription("A column name in delimited text file " + "or an element name in aggregated XML " + "or a property name in delimited json, " + "whose value will be the document uri in MarkLogic Server.") .create(URI_ID); options.addOption(uriId); Option dataType = OptionBuilder.withArgName("data type").hasArg() .withDescription("Comma separated list of column name " + " and data type pairs. 1st to match column name," + " case sensitive. 2nd the data type, case insensitive." 
+ "Data type can be String, Number or Boolean.") .create(DATA_TYPE); options.addOption(dataType); Option threadsPerSplit = OptionBuilder.withArgName("count").hasOptionalArg() .withDescription("The number of threads per split").create(THREADS_PER_SPLIT); options.addOption(threadsPerSplit); Option tolerateErrors = OptionBuilder.withArgName("true,false").hasOptionalArg().withDescription( "Whether to tolerate insertion errors and make sure all " + "successful inserts are committed") .create(TOLERATE_ERRORS); options.addOption(tolerateErrors); Option rdfMemoryThreshold_opt = OptionBuilder.withArgName("threshold").hasArg() .withDescription("Maximum size of an RDF document to be processed in memory") .create(RDF_STREAMING_MEMORY_THRESHOLD); CommandlineOption rdfMemoryThreshold = new CommandlineOption(rdfMemoryThreshold_opt); rdfMemoryThreshold.setHidden(true); options.addOption(rdfMemoryThreshold); Option rdfTriplesPerDoc_opt = OptionBuilder.withArgName("count").hasArg() .withDescription("Maximum number of triples per sem:triples document") .create(RDF_TRIPLES_PER_DOCUMENT); CommandlineOption rdfTriplesPerDoc = new CommandlineOption(rdfTriplesPerDoc_opt); rdfTriplesPerDoc.setHidden(true); options.addOption(rdfTriplesPerDoc); configPartition(options); Option splitInput = OptionBuilder.withArgName("true,false").hasOptionalArg() .withDescription("Whether to split input files to load into MarkLogic. " + " Only available for delimited_text. Default is false.") .create(SPLIT_INPUT); options.addOption(splitInput); Option df = OptionBuilder.withArgName("String").hasArg() .withDescription("Comma-separated list of directories").create(DIRECTORY_FILTER); options.addOption(df); Option cf = OptionBuilder.withArgName("String").hasArg() .withDescription("Comma-separated list of collections").create(COLLECTION_FILTER); options.addOption(cf); Option tf = OptionBuilder.withArgName("String").hasArg() .withDescription("Comma-separated list of document types").create(TYPE_FILTER); options.addOption(tf); Option tcf = OptionBuilder.withArgName("String").hasArg().withDescription("temporal collection name") .create(TEMPORAL_COLLECTION); options.addOption(tcf); } @Override public Job createJob(Configuration conf, CommandLine cmdline) throws IOException { applyConfigOptions(conf, cmdline); InputType type = getInputType(cmdline); type.applyConfigOptions(conf, cmdline); // construct a job Job job = Job.getInstance(conf); job.setInputFormatClass(type.getInputFormatClass(cmdline, conf)); job.setOutputFormatClass(type.getOutputFormatClass(cmdline, conf)); job.setJobName(getNewJobName(conf)); // set mapper class setMapperClass(job, conf, cmdline); if (cmdline.hasOption(INPUT_FILE_PATH)) { String path = cmdline.getOptionValue(INPUT_FILE_PATH); FileInputFormat.setInputPaths(job, path); } if (cmdline.hasOption(INPUT_FILE_PATTERN)) { FileInputFormat.setInputPathFilter(job, DocumentPathFilter.class); } return job; } void applyUriId(Configuration conf, InputType inputType, CommandLine cmdline) { String uriId = null; if (cmdline.hasOption(DELIMITED_URI_ID)) { LOG.warn(DELIMITED_URI_ID + " has been depracated, use " + URI_ID); uriId = cmdline.getOptionValue(DELIMITED_URI_ID); } if (cmdline.hasOption(AGGREGATE_URI_ID)) { LOG.warn(AGGREGATE_URI_ID + " has been depracated, use " + URI_ID); uriId = cmdline.getOptionValue(AGGREGATE_URI_ID); } if (cmdline.hasOption(URI_ID)) { uriId = cmdline.getOptionValue(URI_ID); } String generate = null; if (cmdline.hasOption(GENERATE_URI)) { generate = cmdline.getOptionValue(GENERATE_URI); if (generate == 
null) { generate = "true"; } if (!"true".equalsIgnoreCase(generate) && !"false".equalsIgnoreCase(generate)) { throw new IllegalArgumentException( "Unrecognized option argument for " + GENERATE_URI + ": " + generate); } } if (uriId != null) { if (InputType.AGGREGATES == inputType || InputType.DELIMITED_JSON == inputType || InputType.DELIMITED_TEXT == inputType) { conf.set(CONF_INPUT_URI_ID, uriId); if (InputType.AGGREGATES != inputType && generate != null && "true".equalsIgnoreCase(generate)) { throw new IllegalArgumentException( "Only one of " + GENERATE_URI + " and " + URI_ID + " can be specified"); } } else { throw new IllegalArgumentException(URI_ID + " is not applicable to " + inputType.name()); } } else { if (InputType.DELIMITED_TEXT == inputType) { if ("true".equalsIgnoreCase(generate)) { conf.setBoolean(CONF_INPUT_GENERATE_URI, true); } } else if (InputType.DELIMITED_JSON == inputType) { if (generate != null && "false".equalsIgnoreCase(generate)) { throw new IllegalArgumentException( GENERATE_URI + " must be true if " + URI_ID + " not specified"); } else { conf.setBoolean(CONF_INPUT_GENERATE_URI, true); } } } } @Override public void applyConfigOptions(Configuration conf, CommandLine cmdline) { applyCopyConfigOptions(conf, cmdline); applyCommonOutputConfigOptions(conf, cmdline); applyRDFGraphOutputConfigOptions(conf, cmdline); InputType inputType = getInputType(cmdline); ContentType contentType = inputType.getContentType(cmdline); if (InputType.DELIMITED_TEXT == inputType && ContentType.XML != contentType && contentType.JSON != contentType) { throw new IllegalArgumentException( "The setting for " + DOCUMENT_TYPE + "is not applicable to " + inputType); } applyUriId(conf, inputType, cmdline); if (cmdline.hasOption(DOCUMENT_TYPE) && InputType.DOCUMENTS != inputType && InputType.DELIMITED_TEXT != inputType) { LOG.warn(DOCUMENT_TYPE + " is not supported for " + inputType.name()); } if (cmdline.hasOption(DATA_TYPE)) { if (InputType.DELIMITED_TEXT != inputType) { throw new IllegalArgumentException( DATA_TYPE + " is only applicable to " + InputType.DELIMITED_TEXT.name()); } String type = cmdline.getOptionValue(DOCUMENT_TYPE, ContentType.XML.name()); if ("XML".equalsIgnoreCase(type)) { LOG.warn(DATA_TYPE + " is only applicable when " + DOCUMENT_TYPE + " is " + ContentType.JSON.name()); } else { String value = cmdline.getOptionValue(DATA_TYPE); String[] types = value.split(","); if (types.length % 2 != 0) { throw new IllegalArgumentException( "Invalid option argument for " + DATA_TYPE + ": " + value); } conf.set(CONF_DELIMITED_DATA_TYPE, value); } } conf.set(MarkLogicConstants.CONTENT_TYPE, contentType.name()); if (ContentType.MIXED == contentType) { LOG.info("Content type is set to MIXED. 
The format of the " + " inserted documents will be determined by the MIME " + " type specification configured on MarkLogic Server."); } else { LOG.info("Content type: " + contentType.name()); } if (Command.isStreaming(cmdline, conf)) { conf.setBoolean(MarkLogicConstants.OUTPUT_STREAMING, true); } if (cmdline.hasOption(ARCHIVE_METADATA_OPTIONAL)) { String arg = cmdline.getOptionValue(ARCHIVE_METADATA_OPTIONAL); if (arg == null || arg.equalsIgnoreCase("true")) { conf.setBoolean(CONF_INPUT_ARCHIVE_METADATA_OPTIONAL, true); } else if (arg.equalsIgnoreCase("false")) { conf.setBoolean(CONF_INPUT_ARCHIVE_METADATA_OPTIONAL, false); } else { throw new IllegalArgumentException( "Unrecognized option argument for " + ARCHIVE_METADATA_OPTIONAL + ": " + arg); } } if (cmdline.hasOption(INPUT_COMPRESSION_CODEC)) { String codec = cmdline.getOptionValue(INPUT_COMPRESSION_CODEC); conf.set(CONF_INPUT_COMPRESSION_CODEC, codec.toUpperCase()); } if (cmdline.hasOption(MAX_SPLIT_SIZE)) { String maxSize = cmdline.getOptionValue(MAX_SPLIT_SIZE); conf.set(CONF_MAX_SPLIT_SIZE1, maxSize); conf.set(CONF_MAX_SPLIT_SIZE2, maxSize); } if (cmdline.hasOption(MIN_SPLIT_SIZE)) { String minSize = cmdline.getOptionValue(MIN_SPLIT_SIZE); conf.set(CONF_MIN_SPLIT_SIZE1, minSize); conf.set(CONF_MIN_SPLIT_SIZE2, minSize); } if (cmdline.hasOption(AGGREGATE_RECORD_ELEMENT)) { String recElem = cmdline.getOptionValue(AGGREGATE_RECORD_ELEMENT); conf.set(CONF_AGGREGATE_RECORD_ELEMENT, recElem); } if (cmdline.hasOption(AGGREGATE_RECORD_NAMESPACE)) { String recNs = cmdline.getOptionValue(AGGREGATE_RECORD_NAMESPACE); conf.set(CONF_AGGREGATE_RECORD_NAMESPACE, recNs); } if (cmdline.hasOption(DELIMITER)) { String delim = cmdline.getOptionValue(DELIMITER); if (delim == null || delim.length() != 1) { throw new IllegalArgumentException("Invalid delimiter: " + delim); } conf.set(CONF_DELIMITER, delim); } if (cmdline.hasOption(DELIMITED_ROOT_NAME)) { String type = cmdline.getOptionValue(DOCUMENT_TYPE, ContentType.XML.name()); if ("JSON".equalsIgnoreCase(type)) { LOG.warn(DELIMITED_ROOT_NAME + " is only applicable when " + DOCUMENT_TYPE + " is " + ContentType.XML.name()); } else { String delimRoot = cmdline.getOptionValue(DELIMITED_ROOT_NAME); conf.set(CONF_DELIMITED_ROOT_NAME, delimRoot); } } if (cmdline.hasOption(OUTPUT_FILENAME_AS_COLLECTION)) { String arg = cmdline.getOptionValue(OUTPUT_FILENAME_AS_COLLECTION); if (arg == null || arg.equalsIgnoreCase("true")) { conf.setBoolean(CONF_OUTPUT_FILENAME_AS_COLLECTION, true); } else { conf.setBoolean(CONF_OUTPUT_FILENAME_AS_COLLECTION, false); } } if (cmdline.hasOption(OUTPUT_DIRECTORY)) { String outDir = cmdline.getOptionValue(OUTPUT_DIRECTORY); conf.set(MarkLogicConstants.OUTPUT_DIRECTORY, outDir); } if (cmdline.hasOption(OUTPUT_CLEANDIR)) { String arg = cmdline.getOptionValue(OUTPUT_CLEANDIR); if (arg == null || arg.equalsIgnoreCase("true")) { conf.setBoolean(MarkLogicConstants.OUTPUT_CLEAN_DIR, true); } else if (arg.equalsIgnoreCase("false")) { conf.setBoolean(MarkLogicConstants.OUTPUT_CLEAN_DIR, false); } else { throw new IllegalArgumentException( "Unrecognized option argument for " + OUTPUT_CLEANDIR + ": " + arg); } } if (cmdline.hasOption(NAMESPACE)) { String ns = cmdline.getOptionValue(NAMESPACE); conf.set(MarkLogicConstants.OUTPUT_CONTENT_NAMESPACE, ns); } if (cmdline.hasOption(OUTPUT_LANGUAGE)) { String language = cmdline.getOptionValue(OUTPUT_LANGUAGE); conf.set(MarkLogicConstants.OUTPUT_CONTENT_LANGUAGE, language); } if (cmdline.hasOption(INPUT_FILE_PATTERN)) { if (inputType == 
            if (cmdline.hasOption(INPUT_FILE_PATTERN)) {
                if (inputType == InputType.FOREST) {
                    LOG.warn("The setting for " + INPUT_FILE_PATTERN
                        + " is ignored for input type " + inputType.name());
                } else {
                    String pattern = cmdline.getOptionValue(INPUT_FILE_PATTERN);
                    conf.set(CONF_INPUT_FILE_PATTERN, pattern);
                }
            }
            if (cmdline.hasOption(USERNAME)) {
                String username = cmdline.getOptionValue(USERNAME);
                conf.set(MarkLogicConstants.OUTPUT_USERNAME, username);
            }
            if (cmdline.hasOption(PASSWORD)) {
                String password = cmdline.getOptionValue(PASSWORD);
                conf.set(MarkLogicConstants.OUTPUT_PASSWORD, password);
            }
            if (cmdline.hasOption(HOST)) {
                String host = cmdline.getOptionValue(HOST);
                conf.set(MarkLogicConstants.OUTPUT_HOST, host);
            }
            if (cmdline.hasOption(PORT)) {
                String port = cmdline.getOptionValue(PORT);
                conf.set(MarkLogicConstants.OUTPUT_PORT, port);
            }
            if (cmdline.hasOption(DATABASE)) {
                String db = cmdline.getOptionValue(DATABASE);
                conf.set(MarkLogicConstants.OUTPUT_DATABASE_NAME, db);
            }
            if (cmdline.hasOption(TEMPORAL_COLLECTION)) {
                String tempColl = cmdline.getOptionValue(TEMPORAL_COLLECTION);
                conf.set(MarkLogicConstants.TEMPORAL_COLLECTION, tempColl);
            }
            String repairLevel = cmdline.getOptionValue(XML_REPAIR_LEVEL,
                MarkLogicConstants.DEFAULT_OUTPUT_XML_REPAIR_LEVEL);
            conf.set(MarkLogicConstants.OUTPUT_XML_REPAIR_LEVEL, repairLevel.toUpperCase());
            if (cmdline.hasOption(INPUT_SEQUENCEFILE_KEY_CLASS)) {
                String keyClass = cmdline.getOptionValue(INPUT_SEQUENCEFILE_KEY_CLASS);
                conf.set(CONF_INPUT_SEQUENCEFILE_KEY_CLASS, keyClass);
            }
            if (cmdline.hasOption(INPUT_SEQUENCEFILE_VALUE_CLASS)) {
                String valueClass = cmdline.getOptionValue(INPUT_SEQUENCEFILE_VALUE_CLASS);
                conf.set(CONF_INPUT_SEQUENCEFILE_VALUE_CLASS, valueClass);
            }
            if (cmdline.hasOption(INPUT_SEQUENCEFILE_VALUE_TYPE)) {
                String valueType = cmdline.getOptionValue(INPUT_SEQUENCEFILE_VALUE_TYPE,
                    DEFAULT_SEQUENCEFILE_VALUE_TYPE);
                conf.set(CONF_INPUT_SEQUENCEFILE_VALUE_TYPE, valueType.toUpperCase());
                if (valueType.equalsIgnoreCase(SequenceFileValueType.BYTESWRITABLE.toString())) {
                    conf.set(MarkLogicConstants.CONTENT_TYPE, ContentType.BINARY.toString());
                }
            } else if (conf.get(CONF_INPUT_SEQUENCEFILE_VALUE_TYPE) == null) {
                conf.set(CONF_INPUT_SEQUENCEFILE_VALUE_TYPE, DEFAULT_SEQUENCEFILE_VALUE_TYPE);
            }
            if (cmdline.hasOption(INPUT_FILE_TYPE)) {
                String fileType = cmdline.getOptionValue(INPUT_FILE_TYPE);
                if (fileType.equalsIgnoreCase(InputType.ARCHIVE.toString())) {
                    conf.set(MarkLogicConstants.CONTENT_TYPE, ContentType.UNKNOWN.toString());
                }
            }
            if (cmdline.hasOption(FAST_LOAD)) {
                String arg = cmdline.getOptionValue(FAST_LOAD);
                if (arg == null || arg.equalsIgnoreCase("true")) {
                    conf.setBoolean(MarkLogicConstants.OUTPUT_FAST_LOAD, true);
                    LOG.info("Option fastload is specified. Please make sure "
                        + "that all conditions required to run in fastload "
                        + "mode are satisfied to avoid XDMP-DBDUPURI errors.");
                } else if (arg.equalsIgnoreCase("false")) {
                    conf.setBoolean(MarkLogicConstants.OUTPUT_FAST_LOAD, false);
                } else {
                    throw new IllegalArgumentException("Unrecognized option argument for "
                        + FAST_LOAD + ": " + arg);
                }
            }
            if (cmdline.hasOption(CONTENT_ENCODING)) {
                String arg = cmdline.getOptionValue(CONTENT_ENCODING).toUpperCase();
                if ("SYSTEM".equals(arg)) {
                    arg = Charset.defaultCharset().name();
                } else if (!Charset.isSupported(arg)) {
                    throw new IllegalArgumentException(arg + " encoding is not supported");
                }
                conf.set(MarkLogicConstants.OUTPUT_CONTENT_ENCODING, arg);
            }
            if (cmdline.hasOption(THREADS_PER_SPLIT)) {
                String arg = cmdline.getOptionValue(THREADS_PER_SPLIT);
                int threadCnt = Integer.parseInt(arg);
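                // Streaming writes each document as it is read, so the per-split
                // thread setting is ignored when streaming is enabled (warned below).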
                if (threadCnt > 1 && isStreaming(cmdline, conf)) {
                    LOG.warn("The setting for " + THREADS_PER_SPLIT
                        + " is ignored because streaming is enabled.");
                } else if (threadCnt < inputType.getMinThreads()) {
                    throw new IllegalArgumentException("Cannot set " + THREADS_PER_SPLIT
                        + " to a value less than the minimum required threads ("
                        + inputType.getMinThreads() + ") for the job.");
                } else {
                    conf.set(CONF_THREADS_PER_SPLIT, arg);
                }
            }
            if (cmdline.hasOption(THREAD_COUNT)) {
                String arg = cmdline.getOptionValue(THREAD_COUNT);
                int threadCnt = Integer.parseInt(arg);
                if (threadCnt < inputType.getMinThreads()) {
                    throw new IllegalArgumentException("Cannot set " + THREAD_COUNT
                        + " to a value less than the minimum required threads ("
                        + inputType.getMinThreads() + ") for the job.");
                }
            }
            if (cmdline.hasOption(TEMPORAL_COLLECTION)) {
                String fileType = cmdline.getOptionValue(INPUT_FILE_TYPE);
                if (fileType != null
                    && fileType.equalsIgnoreCase(InputType.RDF.toString())) {
                    throw new IllegalArgumentException(
                        "Cannot ingest RDF into temporal collection");
                }
                if (contentType != null && ContentType.BINARY == contentType) {
                    throw new IllegalArgumentException(
                        "Cannot ingest BINARY into temporal collection");
                }
            }
            if (cmdline.hasOption(TOLERATE_ERRORS)) {
                String arg = cmdline.getOptionValue(TOLERATE_ERRORS);
                if (arg == null || arg.equalsIgnoreCase("true")) {
                    conf.setBoolean(MarkLogicConstants.OUTPUT_TOLERATE_ERRORS, true);
                } else if (arg.equalsIgnoreCase("false")) {
                    conf.setBoolean(MarkLogicConstants.OUTPUT_TOLERATE_ERRORS, false);
                } else {
                    throw new IllegalArgumentException("Unrecognized option argument for "
                        + TOLERATE_ERRORS + ": " + arg);
                }
            }
            applyPartitionConfigOptions(conf, cmdline);
            applyModuleConfigOptions(conf, cmdline);
            if (cmdline.hasOption(SPLIT_INPUT)) {
                String arg = cmdline.getOptionValue(SPLIT_INPUT);
                if (arg == null || arg.equalsIgnoreCase("true")) {
                    if (isInputCompressed(cmdline)) {
                        LOG.warn(INPUT_COMPRESSED + " disables " + SPLIT_INPUT);
                        conf.setBoolean(CONF_SPLIT_INPUT, false);
                    }
                    if (inputType != InputType.DELIMITED_TEXT) {
                        throw new IllegalArgumentException("The setting for " + SPLIT_INPUT
                            + " option is not supported for " + inputType);
                    }
                    conf.setBoolean(CONF_SPLIT_INPUT, true);
                } else if (arg.equalsIgnoreCase("false")) {
                    conf.setBoolean(CONF_SPLIT_INPUT, false);
                } else {
                    throw new IllegalArgumentException("Unrecognized option argument for "
                        + SPLIT_INPUT + ": " + arg);
                }
            }
            if (cmdline.hasOption(COLLECTION_FILTER)) {
                if (inputType == InputType.FOREST) {
                    String colFilter = cmdline.getOptionValue(COLLECTION_FILTER);
                    conf.set(MarkLogicConstants.COLLECTION_FILTER, colFilter);
                } else {
                    LOG.warn("The setting for " + COLLECTION_FILTER
                        + " is not applicable for " + inputType);
                }
            }
            if (cmdline.hasOption(DIRECTORY_FILTER)) {
                if (inputType == InputType.FOREST) {
                    String dirFilter = cmdline.getOptionValue(DIRECTORY_FILTER);
                    conf.set(MarkLogicConstants.DIRECTORY_FILTER, dirFilter);
                } else {
                    LOG.warn("The setting for " + DIRECTORY_FILTER
                        + " is not applicable for " + inputType);
                }
            }
            if (cmdline.hasOption(TYPE_FILTER)) {
                if (inputType == InputType.FOREST) {
                    String typeFilter = cmdline.getOptionValue(TYPE_FILTER);
                    conf.set(MarkLogicConstants.TYPE_FILTER, typeFilter);
                } else {
                    LOG.warn("The setting for " + TYPE_FILTER
                        + " is not applicable for " + inputType);
                }
            }
        }

        @Override
        public void setMapperClass(Job job, Configuration conf, CommandLine cmdline) {
            String inputTypeOption = cmdline.getOptionValue(INPUT_FILE_TYPE,
                INPUT_FILE_TYPE_DEFAULT);
            InputType type = InputType.forName(inputTypeOption);
            int minThreads = type.getMinThreads();
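            // Record the input type's minimum thread requirement so the job
            // scheduler can honor it when threads_per_split is not set (see the
            // javadoc on the abstract setMapperClass declaration below).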
            if (minThreads > 1) {
                conf.setInt(CONF_MIN_THREADS, minThreads);
            }
            int threadCnt = conf.getInt(CONF_THREADS_PER_SPLIT, 1);
            threadCnt = Math.max(threadCnt, minThreads);
            Class<? extends BaseMapper<?, ?, ?, ?>> internalMapperClass =
                type.getMapperClass(cmdline, conf);
            if (threadCnt > 1 && !isStreaming(cmdline, conf)) {
                job.setMapperClass(MultithreadedMapper.class);
                MultithreadedMapper.setMapperClass(job.getConfiguration(),
                    internalMapperClass);
                MultithreadedMapper.setNumberOfThreads(job, threadCnt);
            } else {
                // thread_count_per_split is not greater than 1
                job.setMapperClass(internalMapperClass);
            }
        }

        @SuppressWarnings("unchecked")
        @Override
        public Class<? extends Mapper<?, ?, ?, ?>> getRuntimeMapperClass(Job job,
            Class<? extends Mapper<?, ?, ?, ?>> mapper, int threadCnt,
            int availableThreads) {
            if (threadCnt == 0 && availableThreads > 1
                && !job.getConfiguration().getBoolean(
                    MarkLogicConstants.OUTPUT_STREAMING, false)) {
                Class<? extends Mapper<?, ?, ?, ?>> mapperClass =
                    (Class<? extends Mapper<?, ?, ?, ?>>) (Class) MultithreadedMapper.class;
                MultithreadedMapper.setMapperClass(job.getConfiguration(),
                    (Class<? extends BaseMapper<?, ?, ?, ?>>) mapper);
                return mapperClass;
            } else {
                return mapper;
            }
        }
    },
    EXPORT {
        @Override
        public void configOptions(Options options) {
            configCommonOptions(options);
            configConnectionId(options);
            configCopyOptions(options);
            configFilteringOptions(options);
            configRedactionOptions(options);
            Option outputType = OptionBuilder.withArgName("type").hasArg()
                .withDescription("Export output type")
                .create(OUTPUT_TYPE);
            options.addOption(outputType);
            Option outputFilePath = OptionBuilder.withArgName("path").hasArg()
                .withDescription("Export output file path")
                .create(OUTPUT_FILE_PATH);
            outputFilePath.setRequired(true);
            options.addOption(outputFilePath);
            Option exportCompress = OptionBuilder.withArgName("true,false").hasOptionalArg()
                .withDescription("Whether to compress the output document")
                .create(OUTPUT_COMPRESS);
            options.addOption(exportCompress);
            Option exportIndented = OptionBuilder.withArgName("true,false").hasOptionalArg()
                .withDescription("Whether to format XML data with indentation")
                .create(OUTPUT_INDENTED);
            options.addOption(exportIndented);
            Option snapshot = OptionBuilder.withArgName("true,false").hasOptionalArg()
                .withDescription("Whether to use a consistent timestamp to "
                    + "fetch data from the source database")
                .create(SNAPSHOT);
            options.addOption(snapshot);
            Option encoding = OptionBuilder.withArgName("encoding").hasOptionalArg()
                .withDescription("The charset encoding to be used by MarkLogic "
                    + "when exporting documents")
                .create(CONTENT_ENCODING);
            options.addOption(encoding);
        }

        @Override
        public Job createJob(Configuration conf, CommandLine cmdline) throws IOException {
            applyConfigOptions(conf, cmdline);
            String type = conf.get(CONF_OUTPUT_TYPE, DEFAULT_OUTPUT_TYPE);
            ExportOutputType outputType = ExportOutputType.valueOf(type.toUpperCase());
            if (outputType.equals(ExportOutputType.DOCUMENT)) {
                conf.set(MarkLogicConstants.INPUT_VALUE_CLASS,
                    DatabaseDocument.class.getCanonicalName());
            }
            if (cmdline.hasOption(SNAPSHOT)) {
                String arg = cmdline.getOptionValue(SNAPSHOT);
                if (arg == null || arg.equalsIgnoreCase("true")) {
                    setQueryTimestamp(conf);
                } else if (!arg.equalsIgnoreCase("false")) {
                    throw new IllegalArgumentException("Unrecognized option argument for "
                        + SNAPSHOT + ": " + arg);
                }
            }
            // construct a job
            Job job = Job.getInstance(conf);
            job.setJarByClass(this.getClass());
            job.setInputFormatClass(outputType.getInputFormatClass());
            setMapperClass(job, conf, cmdline);
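            // Map output is (document URI -> document); the output format chosen
            // from the export type serializes each pair to the export target.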
job.setMapOutputKeyClass(DocumentURI.class); job.setMapOutputValueClass(MarkLogicDocument.class); job.setOutputFormatClass(outputType.getOutputFormatClass(cmdline)); job.setOutputKeyClass(DocumentURI.class); job.setJobName(getNewJobName(conf)); AuditUtil.prepareAuditMlcpStart(job, this.name(), cmdline); return job; } @Override public void applyConfigOptions(Configuration conf, CommandLine cmdline) { applyCopyConfigOptions(conf, cmdline); applyFilteringConfigOptions(conf, cmdline); applyRedactionConfigOptions(conf, cmdline); if (cmdline.hasOption(OUTPUT_TYPE)) { String outputType = cmdline.getOptionValue(OUTPUT_TYPE); conf.set(CONF_OUTPUT_TYPE, outputType); } if (cmdline.hasOption(OUTPUT_FILE_PATH)) { String path = cmdline.getOptionValue(OUTPUT_FILE_PATH); String wkdir = conf.get(CONF_MAPREDUCE_JOB_WORKING_DIR); if (wkdir != null) { path = new Path(wkdir, path).toString(); } conf.set(CONF_OUTPUT_FILEPATH, path); } if (cmdline.hasOption(OUTPUT_INDENTED)) { String isIndented = cmdline.getOptionValue(OUTPUT_INDENTED); // check value validity if (isIndented != null) { Indentation indent = Indentation.forName(isIndented); conf.set(MarkLogicConstants.INDENTED, indent.name()); } } if (cmdline.hasOption(HOST)) { String host = cmdline.getOptionValue(HOST); conf.set(MarkLogicConstants.INPUT_HOST, host); } if (cmdline.hasOption(PORT)) { String port = cmdline.getOptionValue(PORT); conf.set(MarkLogicConstants.INPUT_PORT, port); } if (cmdline.hasOption(USERNAME)) { String user = cmdline.getOptionValue(USERNAME); conf.set(MarkLogicConstants.INPUT_USERNAME, user); } if (cmdline.hasOption(PASSWORD)) { String pswd = cmdline.getOptionValue(PASSWORD); conf.set(MarkLogicConstants.INPUT_PASSWORD, pswd); } if (cmdline.hasOption(DATABASE)) { String db = cmdline.getOptionValue(DATABASE); conf.set(MarkLogicConstants.INPUT_DATABASE_NAME, db); } if (cmdline.hasOption(MAX_SPLIT_SIZE)) { String maxSize = cmdline.getOptionValue(MAX_SPLIT_SIZE); conf.set(MarkLogicConstants.MAX_SPLIT_SIZE, maxSize); } if (cmdline.hasOption(CONTENT_ENCODING)) { String arg = cmdline.getOptionValue(CONTENT_ENCODING).toUpperCase(); if ("SYSTEM".equals(arg)) { arg = Charset.defaultCharset().name(); } else if (!Charset.isSupported(arg)) { throw new IllegalArgumentException(arg + " encoding is not supported"); } conf.set(MarkLogicConstants.OUTPUT_CONTENT_ENCODING, arg); } } @Override public void setMapperClass(Job job, Configuration conf, CommandLine cmdline) { job.setMapperClass(DocumentMapper.class); } @Override public Class<? extends Mapper<?, ?, ?, ?>> getRuntimeMapperClass(Job job, Class<? 
            extends Mapper<?, ?, ?, ?>> mapper, int threadCnt, int availableThreads) {
            return mapper;
        }
    },
    COPY {
        @Override
        public void configOptions(Options options) {
            configCommonOptions(options);
            configCopyOptions(options);
            configCommonOutputOptions(options);
            configFilteringOptions(options);
            configBatchTxn(options);
            configModule(options);
            configRedactionOptions(options);
            Option inputUsername = OptionBuilder.withArgName("username").hasArg()
                .withDescription("User name of the input MarkLogic Server")
                .create(INPUT_USERNAME);
            options.addOption(inputUsername);
            Option inputPassword = OptionBuilder.withArgName("password").hasArg()
                .withDescription("Password of the input MarkLogic Server")
                .create(INPUT_PASSWORD);
            options.addOption(inputPassword);
            Option inputHost = OptionBuilder.withArgName("host").hasArg()
                .withDescription("Host of the input MarkLogic Server")
                .create(INPUT_HOST);
            inputHost.setRequired(true);
            options.addOption(inputHost);
            Option inputPort = OptionBuilder.withArgName("port").hasArg()
                .withDescription("Port of the input MarkLogic Server")
                .create(INPUT_PORT);
            options.addOption(inputPort);
            Option inputDB = OptionBuilder.withArgName("database").hasArg()
                .withDescription("Database of the input MarkLogic Server")
                .create(INPUT_DATABASE);
            options.addOption(inputDB);
            Option outputUsername = OptionBuilder.withArgName("username").hasArg()
                .withDescription("User name of the output MarkLogic Server")
                .create(OUTPUT_USERNAME);
            options.addOption(outputUsername);
            Option outputPassword = OptionBuilder.withArgName("password").hasArg()
                .withDescription("Password of the output MarkLogic Server")
                .create(OUTPUT_PASSWORD);
            options.addOption(outputPassword);
            Option outputHost = OptionBuilder.withArgName("host").hasArg()
                .withDescription("Host of the output MarkLogic Server")
                .create(OUTPUT_HOST);
            outputHost.setRequired(true);
            options.addOption(outputHost);
            Option outputPort = OptionBuilder.withArgName("port").hasArg()
                .withDescription("Port of the output MarkLogic Server")
                .create(OUTPUT_PORT);
            options.addOption(outputPort);
            Option outputDB = OptionBuilder.withArgName("database").hasArg()
                .withDescription("Database of the output MarkLogic Server")
                .create(OUTPUT_DATABASE);
            options.addOption(outputDB);
            Option tcf = OptionBuilder.withArgName("String").hasArg()
                .withDescription("Temporal collection name, used only "
                    + "for temporal documents")
                .create(TEMPORAL_COLLECTION);
            options.addOption(tcf);
            Option fastLoad = OptionBuilder.withArgName("true,false").hasOptionalArg()
                .withDescription("Whether to use the fast load mode to load "
                    + "content into MarkLogic")
                .create(FAST_LOAD);
            options.addOption(fastLoad);
            Option outputDir = OptionBuilder.withArgName("directory").hasArg()
                .withDescription("Output directory in MarkLogic.")
                .create(OUTPUT_DIRECTORY);
            options.addOption(outputDir);
            Option tolerateErrors = OptionBuilder.withArgName("true,false").hasOptionalArg()
                .withDescription("Whether to tolerate insertion errors and make "
                    + "sure all successful inserts are committed")
                .create(TOLERATE_ERRORS);
            options.addOption(tolerateErrors);
            configPartition(options);
            Option snapshot = OptionBuilder.withArgName("true,false").hasOptionalArg()
                .withDescription("Whether to use a consistent timestamp to "
                    + "fetch data from the source database")
                .create(SNAPSHOT);
            options.addOption(snapshot);
        }

        @Override
        public Job createJob(Configuration conf, CommandLine cmdline) throws IOException {
            applyConfigOptions(conf, cmdline);
            if (cmdline.hasOption(SNAPSHOT)) {
                String arg = cmdline.getOptionValue(SNAPSHOT);
                if (arg == null ||
arg.equalsIgnoreCase("true")) { setQueryTimestamp(conf); } else if (!arg.equalsIgnoreCase("false")) { throw new IllegalArgumentException("Unrecognized option argument for " + SNAPSHOT + ": " + arg); } } Job job = Job.getInstance(conf); job.setJarByClass(this.getClass()); job.setInputFormatClass(DatabaseContentInputFormat.class); job.setMapperClass(DocumentMapper.class); job.setMapOutputKeyClass(DocumentURI.class); job.setMapOutputValueClass(MarkLogicDocument.class); if (cmdline.hasOption(TRANSFORM_MODULE)) { job.setOutputFormatClass(DatabaseTransformOutputFormat.class); } else { job.setOutputFormatClass(DatabaseContentOutputFormat.class); } job.setOutputKeyClass(DocumentURI.class); job.setJobName(getNewJobName(conf)); AuditUtil.prepareAuditMlcpStart(job, this.name(), cmdline); return job; } @Override public void applyConfigOptions(Configuration conf, CommandLine cmdline) { applyCopyConfigOptions(conf, cmdline); applyFilteringConfigOptions(conf, cmdline); applyCommonOutputConfigOptions(conf, cmdline); applyRedactionConfigOptions(conf, cmdline); if (cmdline.hasOption(OUTPUT_USERNAME)) { String username = cmdline.getOptionValue(OUTPUT_USERNAME); conf.set(MarkLogicConstants.OUTPUT_USERNAME, username); } if (cmdline.hasOption(OUTPUT_PASSWORD)) { String password = cmdline.getOptionValue(OUTPUT_PASSWORD); conf.set(MarkLogicConstants.OUTPUT_PASSWORD, password); } if (cmdline.hasOption(OUTPUT_HOST)) { String host = cmdline.getOptionValue(OUTPUT_HOST); conf.set(MarkLogicConstants.OUTPUT_HOST, host); } if (cmdline.hasOption(OUTPUT_PORT)) { String port = cmdline.getOptionValue(OUTPUT_PORT); conf.set(MarkLogicConstants.OUTPUT_PORT, port); } if (cmdline.hasOption(OUTPUT_DATABASE)) { String db = cmdline.getOptionValue(OUTPUT_DATABASE); conf.set(MarkLogicConstants.OUTPUT_DATABASE_NAME, db); } if (cmdline.hasOption(INPUT_USERNAME)) { String username = cmdline.getOptionValue(INPUT_USERNAME); conf.set(MarkLogicConstants.INPUT_USERNAME, username); } if (cmdline.hasOption(INPUT_PASSWORD)) { String password = cmdline.getOptionValue(INPUT_PASSWORD); conf.set(MarkLogicConstants.INPUT_PASSWORD, password); } if (cmdline.hasOption(INPUT_HOST)) { String host = cmdline.getOptionValue(INPUT_HOST); conf.set(MarkLogicConstants.INPUT_HOST, host); } if (cmdline.hasOption(INPUT_PORT)) { String port = cmdline.getOptionValue(INPUT_PORT); conf.set(MarkLogicConstants.INPUT_PORT, port); } if (cmdline.hasOption(INPUT_DATABASE)) { String db = cmdline.getOptionValue(INPUT_DATABASE); conf.set(MarkLogicConstants.INPUT_DATABASE_NAME, db); } if (cmdline.hasOption(TEMPORAL_COLLECTION)) { String tempColl = cmdline.getOptionValue(TEMPORAL_COLLECTION); conf.set(MarkLogicConstants.TEMPORAL_COLLECTION, tempColl); } if (cmdline.hasOption(MAX_SPLIT_SIZE)) { String maxSize = cmdline.getOptionValue(MAX_SPLIT_SIZE); conf.set(MarkLogicConstants.MAX_SPLIT_SIZE, maxSize); } if (cmdline.hasOption(FAST_LOAD)) { String arg = cmdline.getOptionValue(FAST_LOAD); if (arg == null || arg.equalsIgnoreCase("true")) { conf.setBoolean(MarkLogicConstants.OUTPUT_FAST_LOAD, true); } else { conf.setBoolean(MarkLogicConstants.OUTPUT_FAST_LOAD, false); } } if (cmdline.hasOption(OUTPUT_DIRECTORY)) { String outDir = cmdline.getOptionValue(OUTPUT_DIRECTORY); conf.set(MarkLogicConstants.OUTPUT_DIRECTORY, outDir); } if (cmdline.hasOption(TOLERATE_ERRORS)) { String arg = cmdline.getOptionValue(TOLERATE_ERRORS); conf.set(MarkLogicConstants.OUTPUT_TOLERATE_ERRORS, arg); } if (cmdline.hasOption(TEMPORAL_COLLECTION)) { InputType inputType = getInputType(cmdline); String 
fileType = cmdline.getOptionValue(INPUT_FILE_TYPE); ContentType contentType = inputType.getContentType(cmdline); if (fileType != null && fileType.equalsIgnoreCase(InputType.RDF.toString())) { throw new IllegalArgumentException("Cannot ingest RDF into temporal collection"); } if (contentType != null && ContentType.BINARY == contentType) { throw new IllegalArgumentException("Cannot ingest BINARY into temporal collection"); } } applyPartitionConfigOptions(conf, cmdline); applyModuleConfigOptions(conf, cmdline); } @Override public void setMapperClass(Job job, Configuration conf, CommandLine cmdline) { job.setMapperClass(DocumentMapper.class); } @Override public Class<? extends Mapper<?, ?, ?, ?>> getRuntimeMapperClass(Job job, Class<? extends Mapper<?, ?, ?, ?>> mapper, int threadCnt, int availableThreads) { return mapper; } }, EXTRACT { @Override public void applyConfigOptions(Configuration conf, CommandLine cmdline) { if (cmdline.hasOption(OUTPUT_FILE_PATH)) { String path = cmdline.getOptionValue(OUTPUT_FILE_PATH); String wkdir = conf.get(CONF_MAPREDUCE_JOB_WORKING_DIR); if (wkdir != null) { path = new Path(wkdir, path).toString(); } conf.set(CONF_OUTPUT_FILEPATH, path); } if (cmdline.hasOption(MIN_SPLIT_SIZE)) { String minSize = cmdline.getOptionValue(MIN_SPLIT_SIZE); conf.set(CONF_MIN_SPLIT_SIZE1, minSize); conf.set(CONF_MIN_SPLIT_SIZE2, minSize); } if (cmdline.hasOption(MAX_SPLIT_SIZE)) { String maxSize = cmdline.getOptionValue(MAX_SPLIT_SIZE); conf.set(CONF_MAX_SPLIT_SIZE1, maxSize); conf.set(CONF_MAX_SPLIT_SIZE2, maxSize); } if (cmdline.hasOption(COLLECTION_FILTER)) { String colFilter = cmdline.getOptionValue(COLLECTION_FILTER); conf.set(MarkLogicConstants.COLLECTION_FILTER, colFilter); } if (cmdline.hasOption(DIRECTORY_FILTER)) { String dirFilter = cmdline.getOptionValue(DIRECTORY_FILTER); conf.set(MarkLogicConstants.DIRECTORY_FILTER, dirFilter); } if (cmdline.hasOption(TYPE_FILTER)) { String typeFilter = cmdline.getOptionValue(TYPE_FILTER); conf.set(MarkLogicConstants.TYPE_FILTER, typeFilter); } } @Override public void configOptions(Options options) { configCommonOptions(options); Option inputFilePath = OptionBuilder.withArgName("path").hasArg() .withDescription("The file system location for input, as a " + "regular expression") .create(INPUT_FILE_PATH); inputFilePath.setRequired(true); options.addOption(inputFilePath); Option outputFilePath = OptionBuilder.withArgName("path").hasArg() .withDescription("export output file path").create(OUTPUT_FILE_PATH); outputFilePath.setRequired(true); options.addOption(outputFilePath); Option exportCompress = OptionBuilder.withArgName("true,false").hasOptionalArg() .withDescription("Whether to compress the output document").create(OUTPUT_COMPRESS); options.addOption(exportCompress); Option df = OptionBuilder.withArgName("String").hasArg() .withDescription("Comma-separated list of directories").create(DIRECTORY_FILTER); options.addOption(df); Option cf = OptionBuilder.withArgName("String").hasArg() .withDescription("Comma-separated list of collections").create(COLLECTION_FILTER); options.addOption(cf); Option tf = OptionBuilder.withArgName("String").hasArg() .withDescription("Comma-separated list of document types").create(TYPE_FILTER); options.addOption(tf); } @Override public Job createJob(Configuration conf, CommandLine cmdline) throws IOException { applyConfigOptions(conf, cmdline); // construct a job Job job = Job.getInstance(conf); job.setInputFormatClass(ForestInputFormat.class); Class<? 
extends OutputFormat> outputFormatClass = Command.isOutputCompressed(cmdline) ? ArchiveOutputFormat.class : SingleDocumentOutputFormat.class; job.setOutputFormatClass(outputFormatClass); setMapperClass(job, conf, cmdline); job.setMapOutputKeyClass(DocumentURI.class); job.setMapOutputValueClass(ForestDocument.class); job.setJobName(getNewJobName(conf)); if (cmdline.hasOption(INPUT_FILE_PATH)) { String path = cmdline.getOptionValue(INPUT_FILE_PATH); FileInputFormat.setInputPaths(job, path); } return job; } @Override public Class<? extends Mapper<?, ?, ?, ?>> getRuntimeMapperClass(Job job, Class<? extends Mapper<?, ?, ?, ?>> mapper, int threadCnt, int availableThreads) { return mapper; } @Override public void setMapperClass(Job job, Configuration conf, CommandLine cmdline) { job.setMapperClass(DocumentMapper.class); } }; public static final Log LOG = LogFactory.getLog(LocalJobRunner.class); private static int jobid = 0; private static Random rand = new Random(); public static Command forName(String cmd) { if (cmd.equalsIgnoreCase(IMPORT.name())) { return IMPORT; } else if (cmd.equalsIgnoreCase(EXPORT.name())) { return EXPORT; } else if (cmd.equalsIgnoreCase(COPY.name())) { return COPY; } else if (cmd.equalsIgnoreCase(EXTRACT.name())) { return EXTRACT; } else { throw new IllegalArgumentException("Unknown command: " + cmd); } } protected static boolean isInputCompressed(CommandLine cmdline) { if (cmdline.hasOption(INPUT_COMPRESSED)) { String isCompress = cmdline.getOptionValue(INPUT_COMPRESSED); if (isCompress == null || isCompress.equalsIgnoreCase("true")) { return true; } } return false; } protected static boolean isOutputCompressed(CommandLine cmdline) { if (cmdline.hasOption(OUTPUT_COMPRESS)) { String isCompress = cmdline.getOptionValue(OUTPUT_COMPRESS); if (isCompress == null || isCompress.equalsIgnoreCase("true")) { return true; } } return false; } protected static boolean isStreaming(CommandLine cmdline, Configuration conf) { if (conf.get(MarkLogicConstants.OUTPUT_STREAMING) != null) { return conf.getBoolean(MarkLogicConstants.OUTPUT_STREAMING, false); } String arg = null; if (cmdline.hasOption(STREAMING)) { arg = cmdline.getOptionValue(STREAMING); if (arg == null || arg.equalsIgnoreCase("true")) { InputType inputType = getInputType(cmdline); if (inputType != InputType.DOCUMENTS) { LOG.warn("Streaming option is not applicable to input " + "type " + inputType); conf.setBoolean(MarkLogicConstants.OUTPUT_STREAMING, false); return false; } else { conf.setBoolean(MarkLogicConstants.OUTPUT_STREAMING, true); return true; } } } conf.setBoolean(MarkLogicConstants.OUTPUT_STREAMING, false); return false; } /** * Add supported config options. * * @param options */ public abstract void configOptions(Options options); /** * Create a job based on Hadoop configuration and options. * * @param conf * Hadoop configuration * @param options * options * @return a Hadoop job * @throws Exception */ public abstract Job createJob(Configuration conf, CommandLine cmdline) throws IOException; /** * Apply config options set from the command-line to the configuration. * * @param conf * configuration * @param cmdline * command line options */ public abstract void applyConfigOptions(Configuration conf, CommandLine cmdline); /** * Set Mapper class for a job. If the minimum threads required is more * than 1 and threads_per_split is not set, also set the minimum threads in * the configuration for the job scheduler. 
     *
     * @param job the Hadoop job
     * @param conf Hadoop configuration
     * @param cmdline command line options
     */
    public abstract void setMapperClass(Job job, Configuration conf, CommandLine cmdline);

    public abstract Class<? extends Mapper<?, ?, ?, ?>> getRuntimeMapperClass(Job job,
        Class<? extends Mapper<?, ?, ?, ?>> mapper, int threadCnt, int availableThreads);

    static void setQueryTimestamp(Configuration conf) throws IOException {
        try {
            ContentSource cs = InternalUtilities.getInputContentSource(conf);
            Session session = cs.newSession();
            conf.set(MarkLogicConstants.INPUT_QUERY_TIMESTAMP,
                session.getCurrentServerPointInTime().toString());
        } catch (Exception ex) {
            throw new IOException("Error getting query timestamp", ex);
        }
    }

    static void configRedactionOptions(Options options) {
        Option redaction = OptionBuilder.withArgName("redaction rules").hasArg()
            .withDescription("Comma separated list of redaction rule collection URIs")
            .create(REDACTION);
        options.addOption(redaction);
    }

    static void configCommonOptions(Options options) {
        Option mode = OptionBuilder.withArgName(MODE).hasArg()
            .withDescription("Whether to run in local or distributed mode.")
            .create(MODE);
        options.addOption(mode);
        Option hadoopConfDir = OptionBuilder.withArgName("directory").hasArg()
            .withDescription("Override $HADOOP_CONF_DIR")
            .create(HADOOP_CONF_DIR);
        options.addOption(hadoopConfDir);
        Option threadCount = OptionBuilder.withArgName("count").hasArg()
            .withDescription("Number of threads")
            .create(THREAD_COUNT);
        options.addOption(threadCount);
        Option maxSplitSize = OptionBuilder.withArgName("number").hasArg()
            .withDescription("Maximum number of MarkLogic documents per each "
                + "input split in export or copy, or maximum number of "
                + "bytes in file per each split in import")
            .create(MAX_SPLIT_SIZE);
        options.addOption(maxSplitSize);
        Option minSplitSize = OptionBuilder.withArgName("number").hasArg()
            .withDescription("Minimum number of bytes in file per each split "
                + "in import")
            .create(MIN_SPLIT_SIZE);
        options.addOption(minSplitSize);
    }

    static void configRDFGraphOutputOptions(Options options) {
        Option outputGraph = OptionBuilder.withArgName("graph").hasArg()
            .withDescription("Default graph for quads")
            .create(OUTPUT_GRAPH);
        options.addOption(outputGraph);
        Option outputOverrideGraph = OptionBuilder.withArgName("graph").hasArg()
            .withDescription("Graph overriding the graph specified in quads")
            .create(OUTPUT_OVERRIDE_GRAPH);
        options.addOption(outputOverrideGraph);
    }

    static void configCommonOutputOptions(Options options) {
        Option outputUriReplace = OptionBuilder.withArgName("list").hasArg()
            .withDescription("Comma separated list of regex pattern and "
                + "string pairs, 1st to match a uri segment, 2nd the "
                + "string to replace with, with the 2nd one in ''")
            .create(OUTPUT_URI_REPLACE);
        options.addOption(outputUriReplace);
        Option outputUriPrefix = OptionBuilder.withArgName("prefix").hasArg()
            .withDescription("String to prepend to all document URIs")
            .create(OUTPUT_URI_PREFIX);
        options.addOption(outputUriPrefix);
        Option outputUriSuffix = OptionBuilder.withArgName("suffix").hasArg()
            .withDescription("String to append to all document URIs")
            .create(OUTPUT_URI_SUFFIX);
        options.addOption(outputUriSuffix);
        Option outputCollections = OptionBuilder.withArgName("collections").hasArg()
            .withDescription("Comma separated list of collections to be applied "
                + "to output documents")
            .create(OUTPUT_COLLECTIONS);
        options.addOption(outputCollections);
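        // Example (assumed mlcp syntax; role names illustrative):
        //   -output_permissions "rest-reader,read,rest-writer,update"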
pairs to " + "be applied to output documents") .create(OUTPUT_PERMISSIONS); options.addOption(outputPermissions); Option outputQuantity = OptionBuilder.withArgName("quality").hasArg() .withDescription("Quality to be applied to output documents").create(OUTPUT_QUALITY); options.addOption(outputQuantity); } static void configConnectionId(Options options) { Option username = OptionBuilder.withArgName(USERNAME).hasArg() .withDescription("User name of MarkLogic Server").create(USERNAME); options.addOption(username); Option password = OptionBuilder.withArgName(PASSWORD).hasArg() .withDescription("Password of MarkLogic Server").create(PASSWORD); options.addOption(password); Option host = OptionBuilder.withArgName(HOST).hasArg().withDescription("Host of MarkLogic Server") .create(HOST); host.setRequired(true); options.addOption(host); Option port = OptionBuilder.withArgName(PORT).hasArg().withDescription("Port of MarkLogic Server") .create(PORT); options.addOption(port); Option db = OptionBuilder.withArgName(DATABASE).hasArg().withDescription("Database of MarkLogic Server") .create(DATABASE); options.addOption(db); } static void configCopyOptions(Options options) { Option cpcol = OptionBuilder.withArgName("true,false").hasOptionalArg() .withDescription("Whether to copy document collections from source" + " to destination") .create(COPY_COLLECTIONS); options.addOption(cpcol); Option cppm = OptionBuilder.withArgName("true,false").hasOptionalArg() .withDescription("Whether to copy document permissions from source" + " to destination") .create(COPY_PERMISSIONS); options.addOption(cppm); Option cppt = OptionBuilder.withArgName("true,false").hasOptionalArg() .withDescription("Whether to copy document properties from source" + " to destination") .create(COPY_PROPERTIES); options.addOption(cppt); Option cpqt = OptionBuilder.withArgName("true,false").hasOptionalArg() .withDescription("Whether to copy document quality from source" + " to destination") .create(COPY_QUALITY); options.addOption(cpqt); } static void configBatchTxn(Options options) { Option batchSize = OptionBuilder.withArgName("number").hasArg() .withDescription("Number of documents in one request (default 100)").create(BATCH_SIZE); options.addOption(batchSize); Option txnSize = OptionBuilder.withArgName("number").hasArg() .withDescription("Number of requests in one transaction (default 10)").create(TRANSACTION_SIZE); options.addOption(txnSize); } static void configModule(Options options) { Option moduleUri = OptionBuilder.withArgName("String").hasArg() .withDescription("Path to the module containing the transform function").create(TRANSFORM_MODULE); options.addOption(moduleUri); Option ns = OptionBuilder.withArgName("String").hasArg() .withDescription("Namespace of the transform function").create(TRANSFORM_NAMESPACE); options.addOption(ns); Option func = OptionBuilder.withArgName("String").hasArg().withDescription("Name of the transform function") .create(TRANSFORM_FUNCTION); options.addOption(func); Option param = OptionBuilder.withArgName("String").hasArg() .withDescription("Name of the transform function").create(TRANSFORM_PARAM); options.addOption(param); } static void configPartition(Options options) { Option partition = OptionBuilder.withArgName("partition name").hasArg() .withDescription("The partition where docs are inserted").create(OUTPUT_PARTITION); options.addOption(partition); } static void configFilteringOptions(Options options) { Option df = OptionBuilder.withArgName("String").hasArg() .withDescription("Comma-separated list of 
directories").create(DIRECTORY_FILTER); options.addOption(df); Option cf = OptionBuilder.withArgName("String").hasArg() .withDescription("Comma-separated list of collections").create(COLLECTION_FILTER); options.addOption(cf); Option ds = OptionBuilder.withArgName("String").hasArg() .withDescription("Path expression used to retrieve documents or " + "element nodes from the server") .create(DOCUMENT_SELECTOR); options.addOption(ds); Option ns = OptionBuilder.withArgName("String").hasArg() .withDescription("Comma-separated list of alias-URI bindings " + "used in document_selector") .create(PATH_NAMESPACE); options.addOption(ns); Option qf = OptionBuilder.withArgName("String").hasArg() .withDescription("cts query to retrieve documents with").create(QUERY_FILTER); options.addOption(qf); } static void applyModuleConfigOptions(Configuration conf, CommandLine cmdline) { if (cmdline.hasOption(TRANSFORM_MODULE)) { applyBatchTxn(conf, cmdline, 1); if (conf.getBoolean(MarkLogicConstants.OUTPUT_STREAMING, false) == true) { throw new UnsupportedOperationException("Server-side transformation can't work with streaming"); } String arg = cmdline.getOptionValue(TRANSFORM_MODULE); conf.set(CONF_TRANSFORM_MODULE, arg); if (cmdline.hasOption(TRANSFORM_NAMESPACE)) { arg = cmdline.getOptionValue(TRANSFORM_NAMESPACE); conf.set(CONF_TRANSFORM_NAMESPACE, arg); } if (cmdline.hasOption(TRANSFORM_FUNCTION)) { arg = cmdline.getOptionValue(TRANSFORM_FUNCTION); conf.set(CONF_TRANSFORM_FUNCTION, arg); } if (cmdline.hasOption(TRANSFORM_PARAM)) { arg = cmdline.getOptionValue(TRANSFORM_PARAM); conf.set(CONF_TRANSFORM_PARAM, arg); } } else { applyBatchTxn(conf, cmdline, MAX_BATCH_SIZE); } } static void applyPartitionConfigOptions(Configuration conf, CommandLine cmdline) { if (cmdline.hasOption(OUTPUT_PARTITION)) { String arg = cmdline.getOptionValue(OUTPUT_PARTITION); conf.set(MarkLogicConstants.OUTPUT_PARTITION, arg); } } static void applyCopyConfigOptions(Configuration conf, CommandLine cmdline) { if (cmdline.hasOption(COPY_COLLECTIONS)) { String arg = cmdline.getOptionValue(COPY_COLLECTIONS); if (arg == null || arg.equalsIgnoreCase("true")) { conf.setBoolean(CONF_COPY_COLLECTIONS, true); } else if (arg.equalsIgnoreCase("false")) { conf.setBoolean(CONF_COPY_COLLECTIONS, false); } else { throw new IllegalArgumentException( "Unrecognized option argument for " + COPY_COLLECTIONS + ": " + arg); } } else { conf.set(CONF_COPY_COLLECTIONS, DEFAULT_COPY_COLLECTIONS); } if (cmdline.hasOption(COPY_PERMISSIONS)) { String arg = cmdline.getOptionValue(COPY_PERMISSIONS); if (arg == null || arg.equalsIgnoreCase("true")) { conf.setBoolean(CONF_COPY_PERMISSIONS, true); } else if (arg.equalsIgnoreCase("false")) { conf.setBoolean(CONF_COPY_PERMISSIONS, false); } else { throw new IllegalArgumentException( "Unrecognized option argument for " + COPY_PERMISSIONS + ": " + arg); } } else { conf.set(CONF_COPY_PERMISSIONS, DEFAULT_COPY_PERMISSIONS); } if (cmdline.hasOption(COPY_PROPERTIES)) { String arg = cmdline.getOptionValue(COPY_PROPERTIES); if (arg == null || arg.equalsIgnoreCase("true")) { conf.setBoolean(CONF_COPY_PROPERTIES, true); } else { conf.setBoolean(CONF_COPY_PROPERTIES, false); } } else { conf.set(CONF_COPY_PROPERTIES, DEFAULT_COPY_PROPERTIES); } if (cmdline.hasOption(COPY_QUALITY)) { String arg = cmdline.getOptionValue(COPY_QUALITY); if (arg == null || arg.equalsIgnoreCase("true")) { conf.setBoolean(CONF_COPY_QUALITY, true); } else if (arg.equalsIgnoreCase("false")) { conf.setBoolean(CONF_COPY_QUALITY, false); } else { throw new 
IllegalArgumentException("Unrecognized option argument for " + COPY_QUALITY + ": " + arg); } } else { conf.set(CONF_COPY_QUALITY, DEFAULT_COPY_QUALITY); } } static void applyFilteringConfigOptions(Configuration conf, CommandLine cmdline) { int filters = cmdline.hasOption(COLLECTION_FILTER) ? 1 : 0; filters += cmdline.hasOption(DIRECTORY_FILTER) ? 1 : 0; filters += cmdline.hasOption(DOCUMENT_SELECTOR) ? 1 : 0; if (filters > 1) { LOG.error("Only one of " + COLLECTION_FILTER + ", " + DIRECTORY_FILTER + " and " + DOCUMENT_SELECTOR + " can be specified."); throw new IllegalArgumentException("Only one of " + COLLECTION_FILTER + ", " + DIRECTORY_FILTER + ", " + QUERY_FILTER + " and " + DOCUMENT_SELECTOR + " can be specified."); } if (cmdline.hasOption(COLLECTION_FILTER)) { String c = cmdline.getOptionValue(COLLECTION_FILTER); String[] cf = c.split(","); if (cf.length > 1) { StringBuilder sb = new StringBuilder("("); for (int i = 0; i < cf.length; i++) { if (i > 0) { sb.append(","); } sb.append("\""); sb.append(cf[i]); sb.append("\""); } sb.append(")"); conf.set(MarkLogicConstants.COLLECTION_FILTER, sb.toString()); conf.set(MarkLogicConstants.DOCUMENT_SELECTOR, "fn:collection(" + sb.toString() + ")"); } else { conf.set(MarkLogicConstants.COLLECTION_FILTER, "\"" + c + "\""); conf.set(MarkLogicConstants.DOCUMENT_SELECTOR, "fn:collection(\"" + c + "\")"); } } if (cmdline.hasOption(DIRECTORY_FILTER)) { String d = cmdline.getOptionValue(DIRECTORY_FILTER); String[] df = d.split(","); if (df.length > 1) { StringBuilder sb = new StringBuilder("("); for (int i = 0; i < df.length; i++) { if (i > 0) { sb.append(","); } if (!df[i].endsWith("/")) { LOG.warn( "directory_filter: Directory does not end " + "with a forward slash (/): " + df[i]); } sb.append("\""); sb.append(df[i]); sb.append("\""); } sb.append(")"); conf.set(MarkLogicConstants.DIRECTORY_FILTER, sb.toString()); conf.set(MarkLogicConstants.DOCUMENT_SELECTOR, "xdmp:directory(" + sb.toString() + ",\"infinity\")"); } else { if (!d.endsWith("/")) { LOG.warn("directory_filter: Directory does not end " + "with a forward slash (/): " + d); } conf.set(MarkLogicConstants.DIRECTORY_FILTER, "\"" + d + "\""); conf.set(MarkLogicConstants.DOCUMENT_SELECTOR, "xdmp:directory(\"" + d + "\",\"infinity\")"); } } if (cmdline.hasOption(DOCUMENT_SELECTOR)) { conf.set(MarkLogicConstants.DOCUMENT_SELECTOR, cmdline.getOptionValue(DOCUMENT_SELECTOR)); } if (cmdline.hasOption(PATH_NAMESPACE)) { conf.set(MarkLogicConstants.PATH_NAMESPACE, cmdline.getOptionValue(PATH_NAMESPACE)); } if (cmdline.hasOption(QUERY_FILTER)) { conf.set(MarkLogicConstants.QUERY_FILTER, cmdline.getOptionValue(QUERY_FILTER)); } } static void applyBatchTxn(Configuration conf, CommandLine cmdline, int maxBatch) { String batchSize = cmdline.getOptionValue(BATCH_SIZE); int batch = MarkLogicConstants.DEFAULT_BATCH_SIZE > maxBatch ? 
    static void applyRDFGraphOutputConfigOptions(Configuration conf,
        CommandLine cmdline) {
        if (cmdline.hasOption(OUTPUT_GRAPH)
            && cmdline.hasOption(OUTPUT_OVERRIDE_GRAPH)) {
            throw new IllegalArgumentException(
                "Only one of " + OUTPUT_GRAPH + ", "
                + OUTPUT_OVERRIDE_GRAPH + " can be specified.");
        }
        if (cmdline.hasOption(OUTPUT_GRAPH)) {
            String graph = cmdline.getOptionValue(OUTPUT_GRAPH);
            conf.set(MarkLogicConstants.OUTPUT_GRAPH, graph);
        }
        if (cmdline.hasOption(OUTPUT_OVERRIDE_GRAPH)) {
            String graph = cmdline.getOptionValue(OUTPUT_OVERRIDE_GRAPH);
            conf.set(MarkLogicConstants.OUTPUT_OVERRIDE_GRAPH, graph);
        }
    }

    static void applyRedactionConfigOptions(Configuration conf,
        CommandLine cmdline) {
        if (cmdline.hasOption(REDACTION)) {
            String value = cmdline.getOptionValue(REDACTION);
            conf.set(MarkLogicConstants.REDACTION_RULE_COLLECTION, value);
        }
    }

    static void applyCommonOutputConfigOptions(Configuration conf,
        CommandLine cmdline) {
        if (cmdline.hasOption(OUTPUT_URI_REPLACE)) {
            String uriReplace = cmdline.getOptionValue(OUTPUT_URI_REPLACE);
            if (uriReplace == null) {
                throw new IllegalArgumentException(
                    "Missing option argument: " + OUTPUT_URI_REPLACE);
            } else {
                String[] replace = uriReplace.split(",");
                // URI replace comes in pattern and replacement pairs.
                if (replace.length % 2 != 0) {
                    throw new IllegalArgumentException(
                        "Invalid option argument for "
                        + OUTPUT_URI_REPLACE + ": " + uriReplace);
                }
                // Each replacement (every odd-indexed entry) is expected
                // to be wrapped in single quotes.
                for (int i = 1; i < replace.length; i += 2) {
                    String replacement = replace[i].trim();
                    if (!replacement.startsWith("'")
                        || !replacement.endsWith("'")) {
                        throw new IllegalArgumentException(
                            "Invalid option argument for "
                            + OUTPUT_URI_REPLACE + ": " + uriReplace);
                    }
                }
                conf.setStrings(MarkLogicConstants.CONF_OUTPUT_URI_REPLACE,
                    uriReplace);
            }
        }
        if (cmdline.hasOption(OUTPUT_URI_PREFIX)) {
            String outPrefix = cmdline.getOptionValue(OUTPUT_URI_PREFIX);
            conf.set(MarkLogicConstants.CONF_OUTPUT_URI_PREFIX, outPrefix);
        }
        if (cmdline.hasOption(OUTPUT_URI_SUFFIX)) {
            String outSuffix = cmdline.getOptionValue(OUTPUT_URI_SUFFIX);
            conf.set(MarkLogicConstants.CONF_OUTPUT_URI_SUFFIX, outSuffix);
        }
        if (cmdline.hasOption(OUTPUT_COLLECTIONS)) {
            String collectionsString =
                cmdline.getOptionValue(OUTPUT_COLLECTIONS);
            conf.set(MarkLogicConstants.OUTPUT_COLLECTION, collectionsString);
        }
        if (cmdline.hasOption(OUTPUT_PERMISSIONS)) {
            String permissionString =
                cmdline.getOptionValue(OUTPUT_PERMISSIONS);
            conf.set(MarkLogicConstants.OUTPUT_PERMISSION, permissionString);
        }
        if (cmdline.hasOption(OUTPUT_QUALITY)) {
            String quality = cmdline.getOptionValue(OUTPUT_QUALITY);
            conf.set(MarkLogicConstants.OUTPUT_QUALITY, quality);
        }
        if (cmdline.hasOption(RDF_STREAMING_MEMORY_THRESHOLD)) {
            String thresh =
                cmdline.getOptionValue(RDF_STREAMING_MEMORY_THRESHOLD);
            conf.set(RDF_STREAMING_MEMORY_THRESHOLD, thresh);
        }
        if (cmdline.hasOption(RDF_TRIPLES_PER_DOCUMENT)) {
            String count = cmdline.getOptionValue(RDF_TRIPLES_PER_DOCUMENT);
            conf.set(RDF_TRIPLES_PER_DOCUMENT, count);
        }
    }
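    /*
     * Illustrative usage (hypothetical values): -output_uri_replace takes
     * comma-separated pattern/replacement pairs, and each replacement must
     * be wrapped in single quotes, e.g.:
     *
     *   -output_uri_replace "/tmp/staging,'/incoming',\.tmp$,'.xml'"
     *
     * An odd number of entries, or an unquoted replacement, triggers the
     * IllegalArgumentException in applyCommonOutputConfigOptions above.
     */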
    static InputType getInputType(CommandLine cmdline) {
        String inputTypeOption = cmdline.getOptionValue(INPUT_FILE_TYPE,
            INPUT_FILE_TYPE_DEFAULT);
        return InputType.forName(inputTypeOption);
    }

    static String getNewJobName(Configuration conf) {
        String mode = conf.get(MarkLogicConstants.EXECUTION_MODE,
            MarkLogicConstants.MODE_LOCAL);
        // Job names have the form <mode>_<random int>_<sequence id>,
        // e.g. local_123456789_1.
        StringBuilder jobNameBuf = new StringBuilder();
        jobNameBuf.append(mode);
        jobNameBuf.append('_');
        jobNameBuf.append(rand.nextInt(Integer.MAX_VALUE));
        jobNameBuf.append('_');
        jobNameBuf.append(++jobid);
        return jobNameBuf.toString();
    }

    public void printUsage(Command cmd, Options options) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(HelpFormatter.DEFAULT_WIDTH, cmd.name(), null,
            options, null, true);
    }
}
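To see how the pieces above fit together, here is a minimal, hypothetical driver (CommandDemo is not part of the mlcp distribution): it asks one command to register its options, parses the arguments with commons-cli, and falls back to the usage text on a parse error.

package com.marklogic.contentpump;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;

public class CommandDemo {
    public static void main(String[] args) {
        // Pick a command and let it register its own options.
        Command cmd = Command.valueOf("IMPORT");
        Options options = new Options();
        cmd.configOptions(options);
        try {
            // commons-cli 1.x parser, matching the OptionBuilder style
            // used throughout Command.
            CommandLine line = new GnuParser().parse(options, args);
            // A real driver would now hand `line` to the static apply*
            // helpers above to populate a Hadoop Configuration.
            System.out.println("Parsed " + line.getOptions().length
                + " options for " + cmd.name());
        } catch (ParseException e) {
            cmd.printUsage(cmd, options);
        }
    }
}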