Java tutorial
/*
 * Copyright 2016, Jerry Xiong, BOSSCS
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.bosscs.spark.mongodb.config;

import static com.bosscs.spark.commons.extractor.utils.ExtractorConstants.FILTER_QUERY;
import static com.bosscs.spark.commons.extractor.utils.ExtractorConstants.IGNORE_ID_FIELD;
import static com.bosscs.spark.commons.extractor.utils.ExtractorConstants.INPUT_KEY;
import static com.bosscs.spark.commons.extractor.utils.ExtractorConstants.READ_PREFERENCE;
import static com.bosscs.spark.commons.extractor.utils.ExtractorConstants.REPLICA_SET;
import static com.bosscs.spark.commons.extractor.utils.ExtractorConstants.SORT;
import static com.bosscs.spark.commons.extractor.utils.ExtractorConstants.SPLIT_SIZE;
import static com.bosscs.spark.commons.extractor.utils.ExtractorConstants.USE_CHUNKS;
import static com.bosscs.spark.commons.extractor.utils.ExtractorConstants.USE_SHARD;
import static com.bosscs.spark.commons.extractor.utils.ExtractorConstants.USE_SPLITS;
import static com.bosscs.spark.commons.extractor.utils.ExtractorConstants.WRITE_MODE;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;

import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;
import com.mongodb.QueryBuilder;
import com.mongodb.ReadPreference;
import com.mongodb.WriteConcern;
import com.mongodb.hadoop.util.MongoConfigUtil;

import com.bosscs.spark.commons.config.ExtractorConfig;
import com.bosscs.spark.commons.config.HadoopConfig;
import com.bosscs.spark.commons.entity.Cells;
import com.bosscs.spark.commons.filter.Filter;
import com.bosscs.spark.commons.filter.FilterType;
import com.bosscs.spark.mongodb.extractor.MongoNativeCellExtractor;
import com.bosscs.spark.mongodb.extractor.MongoNativeEntityExtractor;

/**
 * The type Mongo deep job config.
 *
 * @param <T> the type parameter
 */
public class MongoDeepJobConfig<T> extends HadoopConfig<T, MongoDeepJobConfig<T>>
        implements IMongoDeepJobConfig<T, MongoDeepJobConfig<T>>, Serializable {

    /**
     * The constant serialVersionUID.
     */
    private static final long serialVersionUID = -7179376653643603038L;

    /**
     * Configuration to be broadcast to every Spark node.
     */
    private transient Configuration configHadoop;

    /**
     * Indicates the replica set's name.
     */
    private String replicaSet;

    /**
     * Read preference. primaryPreferred is the recommended read preference: if the primary node
     * goes down, reads can still be served from secondaries.
     */
    private String readPreference = ReadPreference.nearest().getName();

    /**
     * OPTIONAL filter query.
     */
    private DBObject query;

    /**
     * OPTIONAL fields to be returned.
     */
    private DBObject fields;

    /**
     * OPTIONAL sorting.
     */
    private String sort;

    /**
     * Shard key.
     */
    private String inputKey;

    /**
     * The Create input split.
     */
    private boolean createInputSplit = true;

    /**
     * The Use shards.
     */
    private boolean useShards = false;

    /**
     * The Splits use chunks.
     */
    private boolean splitsUseChunks = true;
    /**
     * The Split size.
     */
    private Integer splitSize = 8;

    /**
     * The Custom configuration.
     */
    private Map<String, Serializable> customConfiguration;

    private WriteConcern writeConcern = WriteConcern.NORMAL;

    public MongoDeepJobConfig() {
    }

    /**
     * Instantiates a new Mongo deep job config.
     *
     * @param entityClass the entity class
     */
    public MongoDeepJobConfig(Class<T> entityClass) {
        super(entityClass);
        if (Cells.class.isAssignableFrom(entityClass)) {
            extractorImplClass = MongoNativeCellExtractor.class;
        } else {
            extractorImplClass = MongoNativeEntityExtractor.class;
        }
    }

    /**
     * Sets the page size.
     *
     * @param pageSize the page size.
     * @return the mongo config.
     */
    public MongoDeepJobConfig<T> pageSize(int pageSize) {
        this.splitSize = pageSize;
        return this;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public String getUsername() {
        return username;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public MongoDeepJobConfig<T> filterQuery(DBObject query) {
        this.query = query;
        return this;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public MongoDeepJobConfig<T> filterQuery(QueryBuilder query) {
        this.query = query.get();
        return this;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public MongoDeepJobConfig<T> replicaSet(String replicaSet) {
        this.replicaSet = replicaSet;
        return this;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public MongoDeepJobConfig<T> database(String database) {
        this.catalog = database;
        return this;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public MongoDeepJobConfig<T> collection(String collection) {
        this.table = collection;
        return this;
    }

    @Override
    public String getCollection() {
        return table;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public MongoDeepJobConfig<T> username(String username) {
        this.username = username;
        return this;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public MongoDeepJobConfig<T> fields(DBObject fields) {
        this.fields = fields;
        return this;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public MongoDeepJobConfig<T> sort(String sort) {
        this.sort = sort;
        return this;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public MongoDeepJobConfig<T> sort(DBObject sort) {
        this.sort = sort.toString();
        return this;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public MongoDeepJobConfig<T> createInputSplit(boolean createInputSplit) {
        this.createInputSplit = createInputSplit;
        return this;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public MongoDeepJobConfig<T> useShards(boolean useShards) {
        this.useShards = useShards;
        return this;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public MongoDeepJobConfig<T> splitsUseChunks(boolean splitsUseChunks) {
        this.splitsUseChunks = splitsUseChunks;
        return this;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public MongoDeepJobConfig<T> inputKey(String inputKey) {
        this.inputKey = inputKey;
        return this;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public MongoDeepJobConfig<T> password(String password) {
        this.password = password;
        return this;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public MongoDeepJobConfig<T> readPreference(String readPreference) {
        this.readPreference = readPreference;
        return this;
    }
    /**
     * {@inheritDoc}
     */
    @Override
    public MongoDeepJobConfig<T> ignoreIdField() {
        DBObject bsonFields = fields != null ? fields : new BasicDBObject();
        bsonFields.put("_id", 0);
        fields = bsonFields;
        return this;
    }

    @Override
    public String getDatabase() {
        return catalog;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public MongoDeepJobConfig<T> initialize() {
        validate();
        super.initialize();
        configHadoop = new Configuration();

        StringBuilder connection = new StringBuilder();
        connection.append("mongodb").append(":").append("//");
        if (username != null && password != null) {
            connection.append(username).append(":").append(password).append("@");
        }

        boolean firstHost = true;
        for (String hostName : host) {
            if (!firstHost) {
                connection.append(",");
            }
            connection.append(hostName);
            firstHost = false;
        }

        connection.append("/").append(catalog).append(".").append(table);

        StringBuilder options = new StringBuilder();
        // 'asignado' tracks whether the '?' separator has already been appended
        boolean asignado = false;
        if (readPreference != null) {
            asignado = true;
            options.append("?readPreference=").append(readPreference);
        }
        if (replicaSet != null) {
            if (asignado) {
                options.append("&");
            } else {
                options.append("?");
            }
            options.append("replicaSet=").append(replicaSet);
        }
        connection.append(options);

        configHadoop.set(MongoConfigUtil.INPUT_URI, connection.toString());
        configHadoop.set(MongoConfigUtil.OUTPUT_URI, connection.toString());
        configHadoop.set(MongoConfigUtil.INPUT_SPLIT_SIZE, String.valueOf(splitSize));

        if (inputKey != null) {
            configHadoop.set(MongoConfigUtil.INPUT_KEY, inputKey);
        }

        configHadoop.set(MongoConfigUtil.SPLITS_USE_SHARDS, String.valueOf(useShards));
        configHadoop.set(MongoConfigUtil.CREATE_INPUT_SPLITS, String.valueOf(createInputSplit));
        configHadoop.set(MongoConfigUtil.SPLITS_USE_CHUNKS, String.valueOf(splitsUseChunks));

        if (query != null) {
            configHadoop.set(MongoConfigUtil.INPUT_QUERY, query.toString());
        }
        if (fields != null) {
            configHadoop.set(MongoConfigUtil.INPUT_FIELDS, fields.toString());
        }
        if (sort != null) {
            configHadoop.set(MongoConfigUtil.INPUT_SORT, sort);
        }
        if (username != null && password != null) {
            configHadoop.set(MongoConfigUtil.INPUT_URI, connection.toString());
        }

        if (customConfiguration != null) {
            Set<Map.Entry<String, Serializable>> set = customConfiguration.entrySet();
            Iterator<Map.Entry<String, Serializable>> iterator = set.iterator();
            while (iterator.hasNext()) {
                Map.Entry<String, Serializable> entry = iterator.next();
                configHadoop.set(entry.getKey(), entry.getValue().toString());
            }
        }
        return this;
    }
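    /*
     * For illustration (hypothetical values): with a single host "localhost:27017", database "test",
     * collection "input", the default read preference "nearest" and a replica set "rs0", the
     * initialize() method above assembles the connection string
     *
     *   mongodb://localhost:27017/test.input?readPreference=nearest&replicaSet=rs0
     *
     * and stores it under both MongoConfigUtil.INPUT_URI and MongoConfigUtil.OUTPUT_URI.
     */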
    /**
     * Appends the default port to any host entry that does not already specify one.
     */
    private void concantHostPort() {
        for (int i = 0; i < host.size(); i++) {
            if (host.get(i).indexOf(":") == -1) {
                host.set(i, host.get(i).concat(":").concat(String.valueOf(port)));
            }
        }
    }

    /**
     * Validates connection parameters.
     */
    private void validate() {
        if (host.isEmpty()) {
            throw new IllegalArgumentException("host cannot be null or empty");
        }
        if (catalog == null) {
            throw new IllegalArgumentException("database cannot be null");
        }
        if (table == null) {
            throw new IllegalArgumentException("collection cannot be null");
        }
        concantHostPort();
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public MongoDeepJobConfig<T> inputColumns(String... columns) {
        DBObject bsonFields = fields != null ? fields : new BasicDBObject();
        boolean isIdPresent = false;
        for (String column : columns) {
            if (column.trim().equalsIgnoreCase("_id")) {
                isIdPresent = true;
            }
            bsonFields.put(column.trim(), 1);
        }
        if (!isIdPresent) {
            bsonFields.put("_id", 0);
        }
        fields = bsonFields;
        return this;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public Configuration getHadoopConfiguration() {
        if (configHadoop == null) {
            initialize();
        }
        return configHadoop;
    }

    @Override
    public MongoDeepJobConfig<T> initialize(ExtractorConfig extractorConfig) {
        super.initialize(extractorConfig);

        Map<String, Serializable> values = extractorConfig.getValues();

        if (values.get(REPLICA_SET) != null) {
            replicaSet(extractorConfig.getString(REPLICA_SET));
        }
        if (values.get(READ_PREFERENCE) != null) {
            readPreference(extractorConfig.getString(READ_PREFERENCE));
        }
        if (values.get(SORT) != null) {
            sort(extractorConfig.getString(SORT));
        }
        if (values.get(FILTER_QUERY) != null) {
            filterQuery(extractorConfig.getFilterArray(FILTER_QUERY));
        }
        if (values.get(INPUT_KEY) != null) {
            inputKey(extractorConfig.getString(INPUT_KEY));
        }
        if (values.get(IGNORE_ID_FIELD) != null && extractorConfig.getBoolean(IGNORE_ID_FIELD)) {
            ignoreIdField();
        }
        if (values.get(USE_SHARD) != null) {
            useShards(extractorConfig.getBoolean(USE_SHARD));
        }
        if (values.get(USE_SPLITS) != null) {
            createInputSplit(extractorConfig.getBoolean(USE_SPLITS));
        }
        if (values.get(USE_CHUNKS) != null) {
            splitsUseChunks(extractorConfig.getBoolean(USE_CHUNKS));
        }
        if (values.get(SPLIT_SIZE) != null) {
            pageSize(extractorConfig.getInteger(SPLIT_SIZE));
        }
        if (values.get(WRITE_MODE) != null) {
            writeConcern((WriteConcern) extractorConfig.getValue(WriteConcern.class, WRITE_MODE));
        }

        this.initialize();

        return this;
    }

    /**
     * Filter query.
     *
     * @param filters the filters
     * @return the mongo deep job config
     */
    public MongoDeepJobConfig<T> filterQuery(Filter[] filters) {
        if (filters.length > 0) {
            List<BasicDBObject> list = new ArrayList<>();
            QueryBuilder queryBuilder = QueryBuilder.start();
            for (int i = 0; i < filters.length; i++) {
                BasicDBObject bsonObject = new BasicDBObject();
                Filter filter = filters[i];
                if (filter.getFilterType().equals(FilterType.EQ)) {
                    bsonObject.put(filter.getField(), filter.getValue());
                } else {
                    bsonObject.put(filter.getField(),
                            new BasicDBObject("$".concat(filter.getFilterType().getFilterTypeId().toLowerCase()),
                                    filter.getValue()));
                }
                list.add(bsonObject);
            }
            queryBuilder.and(list.toArray(new BasicDBObject[list.size()]));
            filterQuery(queryBuilder);
        }
        return this;
    }
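    /*
     * For example (hypothetical filter): a Filter on field "age" with value 30 and a non-EQ filter
     * type whose id is "GT" becomes { "age" : { "$gt" : 30 } }, an EQ filter maps directly to
     * { "field" : value }, and all resulting clauses are combined under a single $and by the
     * QueryBuilder above.
     */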
    /**
     * Gets input key.
     *
     * @return the input key
     */
    public String getInputKey() {
        return inputKey;
    }

    /**
     * Sets input key.
     *
     * @param inputKey the input key
     */
    public void setInputKey(String inputKey) {
        this.inputKey = inputKey;
    }

    /**
     * Is create input split.
     *
     * @return the boolean
     */
    public boolean isCreateInputSplit() {
        return createInputSplit;
    }

    /**
     * Sets create input split.
     *
     * @param createInputSplit the create input split
     */
    public void setCreateInputSplit(boolean createInputSplit) {
        this.createInputSplit = createInputSplit;
    }

    /**
     * Is use shards.
     *
     * @return the boolean
     */
    public boolean isUseShards() {
        return useShards;
    }

    /**
     * Sets use shards.
     *
     * @param useShards the use shards
     */
    public void setUseShards(boolean useShards) {
        this.useShards = useShards;
    }

    /**
     * Is splits use chunks.
     *
     * @return the boolean
     */
    public boolean isSplitsUseChunks() {
        return splitsUseChunks;
    }

    /**
     * Sets splits use chunks.
     *
     * @param splitsUseChunks the splits use chunks
     */
    public void setSplitsUseChunks(boolean splitsUseChunks) {
        this.splitsUseChunks = splitsUseChunks;
    }

    /**
     * Gets split size.
     *
     * @return the split size
     */
    public Integer getSplitSize() {
        return splitSize;
    }

    /**
     * Sets split size.
     *
     * @param splitSize the split size
     */
    public void setSplitSize(Integer splitSize) {
        this.splitSize = splitSize;
    }

    /**
     * Gets query.
     *
     * @return the query
     */
    public DBObject getQuery() {
        return query;
    }

    public DBObject getDBFields() {
        return fields;
    }

    public WriteConcern getWriteConcern() {
        return writeConcern;
    }

    public MongoDeepJobConfig<T> writeConcern(WriteConcern writeConcern) {
        this.writeConcern = writeConcern;
        return this;
    }

    public MongoDeepJobConfig<T> query(DBObject query) {
        this.query = query;
        return this;
    }

    public String getReplicaSet() {
        return replicaSet;
    }

    public String getReadPreference() {
        return readPreference;
    }

    public String getSort() {
        return sort;
    }

    @Override
    public String toString() {
        final StringBuffer sb = new StringBuffer("MongoDeepJobConfig{");
        sb.append("configHadoop=").append(configHadoop);
        sb.append(", replicaSet='").append(replicaSet).append('\'');
        sb.append(", readPreference='").append(readPreference).append('\'');
        sb.append(", query=").append(query);
        sb.append(", fields=").append(fields);
        sb.append(", sort='").append(sort).append('\'');
        sb.append(", inputKey='").append(inputKey).append('\'');
        sb.append(", createInputSplit=").append(createInputSplit);
        sb.append(", useShards=").append(useShards);
        sb.append(", splitsUseChunks=").append(splitsUseChunks);
        sb.append(", splitSize=").append(splitSize);
        sb.append(", writeConcern=").append(writeConcern);
        sb.append(", customConfiguration=").append(customConfiguration);
        sb.append('}');
        sb.append(super.toString());
        return sb.toString();
    }
}
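A minimal usage sketch follows. It is illustrative only: the example class name and the host, database and collection values are hypothetical, and the host(...) builder is assumed to be inherited from HadoopConfig; the remaining calls are the builder methods defined in the class above.

import org.apache.hadoop.conf.Configuration;

import com.mongodb.hadoop.util.MongoConfigUtil;

import com.bosscs.spark.commons.entity.Cells;
import com.bosscs.spark.mongodb.config.MongoDeepJobConfig;

public class MongoDeepJobConfigExample {

    public static void main(String[] args) {
        // Illustrative sketch: configure a Cells-based Mongo extractor and read back its Hadoop configuration.
        MongoDeepJobConfig<Cells> config = new MongoDeepJobConfig<>(Cells.class);
        config.host("localhost:27017");           // host(...) is assumed to be inherited from HadoopConfig
        config.database("test")                   // hypothetical database name
              .collection("input")                // hypothetical collection name
              .readPreference("primaryPreferred")
              .pageSize(8)
              .initialize();                      // validates the settings and builds the mongodb:// URI

        Configuration hadoopConf = config.getHadoopConfiguration();
        System.out.println(hadoopConf.get(MongoConfigUtil.INPUT_URI));
    }
}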