com.ikanow.infinit.e.data_model.store.config.source.SourcePojo.java Source code

Introduction

Here is the source code for com.ikanow.infinit.e.data_model.store.config.source.SourcePojo.java
Source

/*******************************************************************************
 * Copyright 2012 The Infinit.e Open Source Project
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package com.ikanow.infinit.e.data_model.store.config.source;

import java.io.UnsupportedEncodingException;
import java.lang.reflect.Type;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.codec.binary.Base64;
import org.bson.types.ObjectId;

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonDeserializationContext;
import com.google.gson.JsonDeserializer;
import com.google.gson.JsonElement;
import com.google.gson.JsonParseException;
import com.google.gson.JsonSerializationContext;
import com.google.gson.JsonSerializer;
import com.google.gson.reflect.TypeToken;
import com.ikanow.infinit.e.data_model.store.BaseDbPojo;
import com.ikanow.infinit.e.data_model.store.DbManager;
import com.ikanow.infinit.e.data_model.store.social.authentication.AuthenticationPojo;
import com.mongodb.BasicDBObject;

/**
 * Class used to establish the source information for a feed
 * this defines the data necessary to create a feed in the system
 * 
 * @author cmorgan
 *
 */

public class SourcePojo extends BaseDbPojo {
    // Standard static function for readability
    @SuppressWarnings("unchecked")
    static public TypeToken<List<SourcePojo>> listType() {
        return new TypeToken<List<SourcePojo>>() {
        };
    }

    /** 
      * Private Class Variables
      */

    // Metadata fields

    private ObjectId _id = null;
    final public static String _id_ = "_id";
    private Date created = null;
    final public static String created_ = "created";
    private Date modified = null;
    final public static String modified_ = "modified";
    private String url = null;
    final public static String url_ = "url";
    private String title = null;
    final public static String title_ = "title";
    private Boolean isPublic = null; // if false then many fields are removed when viewed by non-owners/moderators/admins 
    final public static String isPublic_ = "isPublic";
    private Boolean partiallyPublished = null; // if fields are removed based on isPublic then this is set to true
    final public static String partiallyPublished_ = "partiallyPublished";
    private ObjectId ownerId = null;
    final public static String ownerId_ = "ownerId";
    private String author = null;
    final public static String author_ = "author";

    private String mediaType = null;
    final public static String mediaType_ = "mediaType";
    private String key = null;
    final public static String key_ = "key";
    private String description = null;
    final public static String description_ = "description";
    private Set<String> tags = null;
    final public static String tags_ = "tags";

    private Set<ObjectId> communityIds = null;
    final public static String communityIds_ = "communityIds";

    private boolean isApproved = false;
    final public static String isApproved_ = "isApproved";
    private boolean harvestBadSource = false;
    final public static String harvestBadSource_ = "harvestBadSource";

    private String extractType = null; // (in pipeline mode, copied across from pipeline)
    final public static String extractType_ = "extractType";

    private String shah256Hash = null;
    final public static String shah256Hash_ = "shah256Hash";

    // Control fields used everywhere

    private Integer searchCycle_secs = null; // Determines the time between searches, defaults as quickly as the harvest can cycle
    // (in pipeline mode, copied across from pipeline)
    final public static String searchCycle_secs_ = "searchCycle_secs";

    private Integer distributionFactor;
    final public static String distributionFactor_ = "distributionFactor";

    private Integer highestDistributionFactorStored; // (for higher speed distributed storage, this persistent field keeps track of the biggest number used) 
    final public static String highestDistributionFactorStored_ = "highestDistributionFactorStored";

    transient private Collection<String> _distributedKeys; // (cached copy of the distributed keys calculated from highestDistributionFactorStored)
    transient private Object _distributedKeyQueryTerm; // (either a string or a BasicDBOjbect containing a list of keys) 

    private Set<ObjectId> federatedQueryCommunityIds = null; // (populated with communityIds if the source is a federated query - just used for efficient lookups from queries)
    final public static String federatedQueryCommunityIds_ = "federatedQueryCommunityIds";

    public static class SourceSearchIndexFilter {
        public Boolean indexOnIngest = null; // (if specified and false, default:true, then don't index the docs at all)
        public String entityFilter = null; // (regex applied to entity indexes, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)
        public String assocFilter = null; // (regex applied to new-line separated association indexes, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)
        public String entityGeoFilter = null; // (regex applied to entity indexes if the entity has geo, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)
        public String assocGeoFilter = null; // (regex applied to new-line separated association indexes if the association has geo, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)
        public String fieldList = null; // (comma-separated list of doc fields, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)
        public String metadataFieldList = null; // (comma-separated list of doc fields, starts with "+" or "-" to indicate inclusion/exclusion, defaults to include-only)

        // temp:
        public transient Pattern entityFilterRegex;
        public transient Pattern assocFilterRegex;
        public transient Pattern entityGeoFilterRegex;
        public transient Pattern assocGeoFilterRegex;
    }

    // PROCESSING PIPELINE

    private List<SourcePipelinePojo> processingPipeline;
    final public static String processingPipeline_ = "processingPipeline";

    // The new template-based capability:
    private BasicDBObject templateProcessingFlow;
    final public static String templateProcessingFlow_ = "templateProcessingFlow";

    // LEGACY CODE, IGNORED IN PROCESSING-PIPELINE MODE

    private SourceHarvestStatusPojo harvest = null;
    final public static String harvest_ = "harvest";
    private SourceDatabaseConfigPojo database = null;
    final public static String database_ = "database";
    private SourceNoSqlConfigPojo nosql = null;
    final public static String nosql_ = "nosql";

    private SourceFileConfigPojo file = null;
    final public static String file_ = "file";
    private SourceRssConfigPojo rss = null;
    final public static String rss_ = "rss";

    private AuthenticationPojo authentication = null;
    final public static String authentication_ = "authentication";

    private String useExtractor = null;
    final public static String useExtractor_ = "useExtractor";
    private String useTextExtractor = null;
    final public static String useTextExtractor_ = "useTextExtractor";

    private StructuredAnalysisConfigPojo structuredAnalysis = null;
    final public static String structuredAnalysis_ = "structuredAnalysis";
    private UnstructuredAnalysisConfigPojo unstructuredAnalysis = null;
    final public static String unstructuredAnalysis_ = "unstructuredAnalysis";

    private Integer maxDocs = null; // Limits the number of docs that can be stored for this source at any one time
    final public static String maxDocs_ = "maxDocs";
    private Integer timeToLive_days = null; // Sets a time to live for the documents harvested, after which they are deleted
    final public static String timeToLive_days_ = "timeToLive_days";
    private Integer throttleDocs = null; // Limits the number of docs that can be harvested in one cycle (cannot be higher than system setting in harvest.maxdocs_persource)
    final public static String throttleDocs_ = "throttleDocs";
    private Boolean duplicateExistingUrls; // If false (defaults: true) will ignore docs harvested by other sources in the community
    final public static String duplicateExistingUrls_ = "duplicateExistingUrls";
    private Boolean appendTagsToDocs = null; // if true (default) source tags are appended to the document

    final public static String appendTagsToDocs_ = "appendTagsToDocs";

    private SourceSearchIndexFilter searchIndexFilter = null; // Optional, allows the source builder to configure which fields are searchable
    final public static String searchIndexFilter_ = "searchIndexFilter";

    private LinkedHashMap<String, String> extractorOptions = null; // Optional, overrides the per-extractor configuration options, where permissible
    final public static String extractorOptions_ = "extractorOptions";

    //////////////////////////////////////

    // Gets and sets

    public AuthenticationPojo getAuthentication() {
        return authentication;
    }

    public void setAuthentication(AuthenticationPojo authentication) {
        this.authentication = authentication;
    }

    public SourceFileConfigPojo getFileConfig() {
        return file;
    }

    public void setFileConfig(SourceFileConfigPojo file) {
        this.file = file;
    }

    public SourceRssConfigPojo getRssConfig() {
        return rss;
    }

    public void setRssConfig(SourceRssConfigPojo rss) {
        this.rss = rss;
    }

    public SourceDatabaseConfigPojo getDatabaseConfig() {
        return database;
    }

    public void setDatabaseConfig(SourceDatabaseConfigPojo database) {
        this.database = database;
    }

    public ObjectId getId() {
        return _id;
    }

    public void setId(ObjectId id) {
        this._id = id;
    }

    public Collection<String> getDistributedKeys() {
        if (null != _distributedKeys) {
            return _distributedKeys;
        }
        _distributedKeys = getDistributedKeys(key, highestDistributionFactorStored);
        return _distributedKeys;
    }//TESTED (see static version) 

    public Object getDistributedKeyQueryTerm() {
        if (null != this._distributedKeyQueryTerm) {
            return _distributedKeyQueryTerm;
        } else if (null == highestDistributionFactorStored) {
            _distributedKeyQueryTerm = key;
        } else {
            BasicDBObject queryTerm = new BasicDBObject(DbManager.gte_, key);
            queryTerm.put(DbManager.lt_, key + "#:");
            _distributedKeyQueryTerm = queryTerm;
        }
        return _distributedKeyQueryTerm;
    }//TESTED (by hand, both clauses)

    public static Object getDistributedKeyQueryTerm(String key) {
        return getDistributedKeyQueryTerm(key, 1);
    }//TESTED

    public static Object getDistributedKeyQueryTerm(String key, Integer highestDistributionFactorStored) {
        if (null == highestDistributionFactorStored) {
            return key;
        } else {
            BasicDBObject queryTerm = new BasicDBObject(DbManager.gte_, key);
            queryTerm.put(DbManager.lt_, key + "#:");
            return queryTerm;
        }
    }//TESTED

    public static Collection<String> getDistributedKeys(String key, Integer highestDistributionFactorStored) {
        int numShards = 1;
        if (null != highestDistributionFactorStored) {
            numShards = highestDistributionFactorStored;
        }
        ArrayList<String> distributedKeys = new ArrayList<String>(numShards);
        StringBuffer keySb = new StringBuffer(key).append("#");
        int originalLength = keySb.length();
        for (int i = 0; i < numShards; i++) {
            if (0 == i) {
                distributedKeys.add(key);
            } else {
                keySb.append(i);
                distributedKeys.add(keySb.toString());
                keySb.setLength(originalLength);
            }
        }
        return distributedKeys;
    }//TESTED (by hand, both cases)

    public String getKey() {
        return key;
    }

    public void setKey(String key) {
        this.key = key;
    }

    public Date getCreated() {
        return created;
    }

    public void setCreated(Date created) {
        this.created = created;
    }

    public Date getModified() {
        return modified;
    }

    public void setModified(Date modified) {
        this.modified = modified;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getDescription() {
        return description;
    }

    public void setDescription(String description) {
        this.description = description;
    }

    public String getMediaType() {
        return mediaType;
    }

    public void setMediaType(String mediaType) {
        this.mediaType = mediaType;
    }

    public String getExtractType() {
        return extractType;
    }

    public void setExtractType(String extractType) {
        this.extractType = extractType;
    }

    public Boolean getIsPublic() {
        return isPublic;
    }

    public boolean isPublic() {
        return (isPublic == null) ? false : isPublic; // (ie defaults to false)
    }

    public void setIsPublic(Boolean isPublic) {
        this.isPublic = isPublic;
    }

    public void setPublic(boolean isPublic) {
        this.isPublic = isPublic;
    }

    public String getAuthor() {
        return author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    /** 
      * Get the tags
      */
    public Set<String> getTags() {
        return tags;
    }

    /** 
      * Set the tags
      */
    public void setTags(Set<String> tags) {
        this.tags = tags;
    }

    /**
     * @param ownerID the ownerID to set
     */
    public void setOwnerId(ObjectId ownerID) {
        this.ownerId = ownerID;
    }

    /**
     * @return the ownerID
     */
    public ObjectId getOwnerId() {
        return ownerId;
    }

    public SourcePojo() {

    }

    public void setHarvestStatus(SourceHarvestStatusPojo harvest) {
        this.harvest = harvest;
    }

    public SourceHarvestStatusPojo getHarvestStatus() {
        return harvest;
    }

    public void setApproved(boolean isApproved) {
        this.isApproved = isApproved;
    }

    public boolean isApproved() {
        return isApproved;
    }

    public void addToCommunityIds(ObjectId communityID) {
        if (null == this.communityIds) {
            this.communityIds = new HashSet<ObjectId>();
        }
        this.communityIds.add(communityID);
    }

    public void removeFromCommunityIds(ObjectId communityID) {
        if (null != this.communityIds) {
            this.communityIds.remove(communityID);
        }
    }

    public Set<ObjectId> getCommunityIds() {
        return communityIds;
    }

    public void setCommunityIds(Set<ObjectId> ids) {
        communityIds = ids;
    }

    public void setHarvestBadSource(boolean harvestBadSource) {
        this.harvestBadSource = harvestBadSource;
    }

    public boolean isHarvestBadSource() {
        return harvestBadSource;
    }

    /**
     * @param useExtractor the useExtractor to set
     */
    public void setUseExtractor(String useExtractor) {
        this.useExtractor = useExtractor;
    }

    /**
     * @return the useExtractor
     */
    public String useExtractor() {
        return useExtractor;
    }

    /**
     * @param useTextExtractor the useTextExtractor to set
     */
    public void setUseTextExtractor(String useTextExtractor) {
        this.useTextExtractor = useTextExtractor;
    }

    /**
     * @return the useTextExtractor
     */
    public String useTextExtractor() {
        return useTextExtractor;
    }

    /**
     * @param structedAnalysis the structedAnalysis to set
     */
    public void setStructuredAnalysisConfig(StructuredAnalysisConfigPojo structuredAnalysis) {
        this.structuredAnalysis = structuredAnalysis;
    }

    /**
     * @return the structedAnalysis
     */
    public StructuredAnalysisConfigPojo getStructuredAnalysisConfig() {
        return structuredAnalysis;
    }

    /**
     * @param structuredAnalysis the structuredAnalysis to set
     */
    public void setUnstructuredAnalysisConfig(UnstructuredAnalysisConfigPojo unstructuredAnalysis) {
        this.unstructuredAnalysis = unstructuredAnalysis;
    }

    /**
     * @return the unstructuredAnalysis
     */
    public UnstructuredAnalysisConfigPojo getUnstructuredAnalysisConfig() {
        return unstructuredAnalysis;
    }

    /**
     * setShah256Hash - calls generateShah256Hash
     */
    public void generateShah256Hash() {
        try {
            generateShah256Hash_internal();
        } catch (Exception e) {

        }
    }

    /**
     * getShah256Hash - calls generateShah256Hash if shah256Hash is null
     * @return
     */
    public String getShah256Hash() {
        if (null != shah256Hash) {
            return shah256Hash;
        } else {
            try {
                generateShah256Hash_internal();
                return shah256Hash;
            } catch (Exception e) {
                return null;
            }
        }
    }
    // Utility:

    /**
     * generateSourceKey
     * Strips out http://, smb:// /, :, etc. from the URL field to generate
     * Example: http://www.ikanow.com/rss -> www.ikanow.com.rss
     */
    public String generateSourceKey() {
        String s = getRepresentativeUrl(); // (supports all cases - note we are guaranteed to have a URL by this point)
        if (null == s) {
            return null;
        }

        int nIndex = s.indexOf('?');
        final int nMaxLen = 64; // (+24 for the object id, + random other stuff, keeps it in the <100 range)
        if (nIndex >= 0) {
            if (nIndex > nMaxLen) {
                nIndex = nMaxLen; // (ie max length)
            }
            StringBuffer sb = new StringBuffer(s.substring(0, nIndex));
            sb.append(".").append(s.length() - nIndex).append('.').append(Math.abs(s.hashCode()) % 100);
            s = sb.toString();
        } else if (s.length() > nMaxLen) {
            s = s.substring(0, nMaxLen);
        }
        //TESTED (urls with and without ?)

        s = s.replaceAll("http://|https://|smb://|ftp://|ftps://|file://|[^a-zA-Z0-9_.]", ".");
        if (s.startsWith("."))
            s = s.substring(1);
        return s;
    }

    /**
     * generateShah256Hash
     * Combines the required and optional fields of a SourcePojo into a string that is
     * then hashed using SHAH-256 and saved to the SourePojo.shah256Hash field;
     * this value is used to determine source uniqueness
     * @throws NoSuchAlgorithmException
     * @throws UnsupportedEncodingException
     */
    private void generateShah256Hash_internal() throws NoSuchAlgorithmException, UnsupportedEncodingException {
        // Create StringBuffer with fields to use to establish source *processing* uniqueness
        StringBuffer sb = new StringBuffer();

        // (Note what I mean by "source processing uniqueness" is that, *for a specific doc URL* 2 sources would process it identically)   
        // So fields like key,URL,media type,tags,etc aren't included in the hash

        if (null != processingPipeline) { // new processing pipeline contains all the logic that determines a source's processing
            for (SourcePipelinePojo pxPipe : processingPipeline) {
                if ((null == pxPipe.feed) && (null == pxPipe.web)) { // (these are too difficult to pull the URL out of)
                    String fileUrl = null;
                    if (null != pxPipe.file) {
                        fileUrl = pxPipe.file.getUrl();
                        pxPipe.file.setUrl(null);
                    }
                    // (don't both with DB because its URL is so intertwined with its processing)
                    sb.append(new Gson().toJson(pxPipe));
                    if (null != fileUrl) {
                        pxPipe.file.setUrl(fileUrl);
                    } // (stay idempotent)
                }
            }
        } //TESTED
        else { //legacy case

            // Required Fields
            sb.append(this.extractType);

            // Optional fields
            if (this.extractType != null)
                sb.append(this.extractType);
            if (this.useExtractor != null)
                sb.append(this.useExtractor);
            if (this.useTextExtractor != null)
                sb.append(this.useTextExtractor);

            // Generate a hash of all the objects using the ORM layer
            SourcePojo newSrc = new SourcePojo();
            newSrc.setId(null); // (in case this is auto set by the c'tor)
            newSrc.setAuthentication(this.authentication);
            newSrc.setDatabaseConfig(this.database);
            newSrc.setFileConfig(this.file);
            // Don't include RSS config since it can contain URLs
            newSrc.setStructuredAnalysisConfig(this.structuredAnalysis);
            newSrc.setUnstructuredAnalysisConfig(this.unstructuredAnalysis);
            sb.append(((BasicDBObject) newSrc.toDb()).toString());

        } //TESTED (legacy)

        // Create MessageDigest and set shah256Hash value
        MessageDigest md = MessageDigest.getInstance("SHA-256");
        md.update(sb.toString().getBytes("UTF-8"));
        shah256Hash = Base64.encodeBase64String(md.digest());
    }

    public Integer getSearchCycle_secs() {
        return searchCycle_secs;
    }

    public void setSearchCycle_secs(Integer searchCycle_secs) {
        this.searchCycle_secs = searchCycle_secs;
    }

    public void setMaxDocs(Integer maxDocs) {
        this.maxDocs = maxDocs;
    }

    public Integer getMaxDocs() {
        return maxDocs;
    }

    public void setReachedMaxDocs() {
        this.reachedMaxDocs = true;
    }

    public boolean reachedMaxDocs() {
        return reachedMaxDocs;
    }

    public void setDuplicateExistingUrls(Boolean duplicateExistingUrls) {
        this.duplicateExistingUrls = duplicateExistingUrls;
    }

    public boolean getDuplicateExistingUrls() { // (defaults to true)
        return duplicateExistingUrls == null ? true : duplicateExistingUrls;
    }

    public SourceSearchIndexFilter getSearchIndexFilter() {
        initSearchIndexFilter(searchIndexFilter);
        return searchIndexFilter;
    }

    public void setSearchIndexFilter(SourceSearchIndexFilter searchIndexFilter) {
        this.searchIndexFilter = searchIndexFilter;
    }
    ///////////////////////////////////////////////////////////////////////////////////

    // Transient state (implementation details)

    transient private boolean reachedMaxDocs = false;
    // (if set to true, means that the next search cycle won't be applied - otherwise if you only search once per day
    //  and only process 5K docs/search, it can take a while to build up large repositories)

    private transient Set<Integer> distributionTokens; // (temporary internal state for managing intra-source distribution)

    private transient Boolean ownedByAdmin = null;

    // Build some regexes:

    public static void initSearchIndexFilter(SourceSearchIndexFilter searchIndexFilter) {
        if (null != searchIndexFilter) { // Initialize regex
            if ((null != searchIndexFilter.assocFilter) && (null == searchIndexFilter.assocFilterRegex)) {
                if (searchIndexFilter.assocFilter.startsWith("+")
                        || searchIndexFilter.assocFilter.startsWith("-")) {
                    searchIndexFilter.assocFilterRegex = Pattern.compile(searchIndexFilter.assocFilter.substring(1),
                            Pattern.CASE_INSENSITIVE | Pattern.DOTALL | Pattern.MULTILINE);
                } else {
                    searchIndexFilter.assocFilterRegex = Pattern.compile(searchIndexFilter.assocFilter,
                            Pattern.CASE_INSENSITIVE | Pattern.DOTALL | Pattern.MULTILINE);
                }
            }
            if ((null != searchIndexFilter.assocGeoFilter) && (null == searchIndexFilter.assocGeoFilterRegex)) {
                if (searchIndexFilter.assocGeoFilter.startsWith("+")
                        || searchIndexFilter.assocGeoFilter.startsWith("-")) {
                    searchIndexFilter.assocGeoFilterRegex = Pattern.compile(
                            searchIndexFilter.assocGeoFilter.substring(1),
                            Pattern.CASE_INSENSITIVE | Pattern.DOTALL | Pattern.MULTILINE);
                } else {
                    searchIndexFilter.assocGeoFilterRegex = Pattern.compile(searchIndexFilter.assocGeoFilter,
                            Pattern.CASE_INSENSITIVE | Pattern.DOTALL | Pattern.MULTILINE);
                }
            }
            if ((null != searchIndexFilter.entityFilter) && (null == searchIndexFilter.entityFilterRegex)) {
                if (searchIndexFilter.entityFilter.startsWith("+")
                        || searchIndexFilter.entityFilter.startsWith("-")) {
                    searchIndexFilter.entityFilterRegex = Pattern
                            .compile(searchIndexFilter.entityFilter.substring(1), Pattern.CASE_INSENSITIVE);
                } else {
                    searchIndexFilter.entityFilterRegex = Pattern.compile(searchIndexFilter.entityFilter,
                            Pattern.CASE_INSENSITIVE);
                }
            }
            if ((null != searchIndexFilter.entityGeoFilter) && (null == searchIndexFilter.entityGeoFilterRegex)) {
                if (searchIndexFilter.entityGeoFilter.startsWith("+")
                        || searchIndexFilter.entityGeoFilter.startsWith("-")) {
                    searchIndexFilter.entityGeoFilterRegex = Pattern
                            .compile(searchIndexFilter.entityGeoFilter.substring(1), Pattern.CASE_INSENSITIVE);
                } else {
                    searchIndexFilter.entityGeoFilterRegex = Pattern.compile(searchIndexFilter.entityGeoFilter,
                            Pattern.CASE_INSENSITIVE);
                }
            }
        } // (end if search filter specified)
    }//(end initialize search filter)

    public void setExtractorOptions(LinkedHashMap<String, String> extractorOptions) {
        this.extractorOptions = extractorOptions;
    }

    public LinkedHashMap<String, String> getExtractorOptions() {
        return extractorOptions;
    }
    //TESTED

    public void setProcessingPipeline(List<SourcePipelinePojo> processingPipeline) {
        this.processingPipeline = processingPipeline;
    }

    public List<SourcePipelinePojo> getProcessingPipeline() {
        return processingPipeline;
    }

    public void setAppendTagsToDocs(Boolean appendTagsToDocs) {
        this.appendTagsToDocs = appendTagsToDocs;
    }

    public Boolean getAppendTagsToDocs() {
        return appendTagsToDocs;
    }

    public void setNoSql(SourceNoSqlConfigPojo noSql) {
        this.nosql = noSql;
    }

    public SourceNoSqlConfigPojo getNoSql() {
        return nosql;
    }

    public void setDistributionFactor(Integer distributionFactor) {
        this.distributionFactor = distributionFactor;
    }

    public Integer getDistributionFactor() {
        return distributionFactor;
    }

    public void setDistributionTokens(Set<Integer> distributionTokens) {
        this.distributionTokens = distributionTokens;
    }

    public Set<Integer> getDistributionTokens() {
        return distributionTokens;
    }

    public void setThrottleDocs(Integer throttleDocs) {
        this.throttleDocs = throttleDocs;
    }

    public Integer getThrottleDocs() {
        return throttleDocs;
    }

    ///////////////////////////////////////////////////////////////////

    // Serialization/deserialization utils:
    // (Ugh needed because extractorOptions keys can contain "."s)

    public GsonBuilder extendBuilder(GsonBuilder gp) {
        return gp.registerTypeAdapter(SourcePojo.class, new SourcePojoDeserializer())
                .registerTypeAdapter(SourcePojo.class, new SourcePojoSerializer());
    }

    protected static class SourcePojoDeserializer implements JsonDeserializer<SourcePojo> {
        @Override
        public SourcePojo deserialize(JsonElement json, Type typeOfT, JsonDeserializationContext context)
                throws JsonParseException {
            SourcePojo src = new SourceFederatedQueryConfigPojo().extendBuilder(BaseDbPojo.getDefaultBuilder())
                    .create().fromJson(json, SourcePojo.class);
            //(note the src api sub map bypasses this but explicity adds the SourceFederatedQueryConfigPojo itself)

            if (null != src.extractorOptions) {
                src.extractorOptions = decodeKeysForDatabaseStorage(src.extractorOptions);
            }
            if (null != src.processingPipeline) {
                for (SourcePipelinePojo pxPipe : src.processingPipeline) {
                    if ((null != pxPipe.web) || (null != pxPipe.feed)) {
                        SourceRssConfigPojo webOrFeed = (null != pxPipe.web) ? pxPipe.web : pxPipe.feed;
                        if (null != webOrFeed.getHttpFields()) {
                            webOrFeed.setHttpFields(decodeKeysForDatabaseStorage(webOrFeed.getHttpFields()));
                        }
                    } //TESTED (added httpFields by hand)
                      // (don't do lookup tables, "."s aren't allowed in their keys)
                    if ((null != pxPipe.featureEngine) && (null != pxPipe.featureEngine.engineConfig)) {
                        pxPipe.featureEngine.engineConfig = decodeKeysForDatabaseStorage(
                                pxPipe.featureEngine.engineConfig);
                    } //TESTED (basic_web_test_ocOptions)
                    if ((null != pxPipe.textEngine) && (null != pxPipe.textEngine.engineConfig)) {
                        pxPipe.textEngine.engineConfig = decodeKeysForDatabaseStorage(
                                pxPipe.textEngine.engineConfig);
                    } //TESTED (c/p basic_web_test_ocOptions)
                }
            }
            return src;
        }//TESTED (with and without extractor options)
    }

    protected static class SourcePojoSerializer implements JsonSerializer<SourcePojo> {
        @Override
        public JsonElement serialize(SourcePojo src, Type typeOfT, JsonSerializationContext context)
                throws JsonParseException {
            if (null != src.extractorOptions) {
                src.extractorOptions = encodeKeysForDatabaseStorage(src.extractorOptions);
            }
            if (null != src.processingPipeline) {
                for (SourcePipelinePojo pxPipe : src.processingPipeline) {
                    if ((null != pxPipe.web) || (null != pxPipe.feed)) {
                        SourceRssConfigPojo webOrFeed = (null != pxPipe.web) ? pxPipe.web : pxPipe.feed;
                        if (null != webOrFeed.getHttpFields()) {
                            webOrFeed.setHttpFields(encodeKeysForDatabaseStorage(webOrFeed.getHttpFields()));
                        }
                    } //TESTED (added httpFields by hand)
                      // (don't do lookup tables, "."s aren't allowed in their keys)
                    if ((null != pxPipe.featureEngine) && (null != pxPipe.featureEngine.engineConfig)) {
                        pxPipe.featureEngine.engineConfig = encodeKeysForDatabaseStorage(
                                pxPipe.featureEngine.engineConfig);
                    } //TESTED (basic_web_test_ocOptions)
                    if ((null != pxPipe.textEngine) && (null != pxPipe.textEngine.engineConfig)) {
                        pxPipe.textEngine.engineConfig = encodeKeysForDatabaseStorage(
                                pxPipe.textEngine.engineConfig);
                    } //TESTED (c/p basic_web_test_ocOptions)
                }
            }
            // GSON transformation:
            JsonElement je = SourcePojo.getDefaultBuilder().create().toJsonTree(src, typeOfT);

            return je;
        }//TESTED (with and without extractor options)
    }
    // Utilities for handling processing pipeline

    // Decode/Encode utilities

    private static LinkedHashMap<String, String> decodeKeysForDatabaseStorage(LinkedHashMap<String, String> in) {
        LinkedHashMap<String, String> transformed = new LinkedHashMap<String, String>();
        for (Map.Entry<String, String> entry : in.entrySet()) {
            transformed.put(entry.getKey().replace("%2e", "."), entry.getValue());
        }
        return transformed;
    }//TESTED (legacy)

    private static LinkedHashMap<String, String> encodeKeysForDatabaseStorage(LinkedHashMap<String, String> in) {
        LinkedHashMap<String, String> transformed = new LinkedHashMap<String, String>();
        for (Map.Entry<String, String> entry : in.entrySet()) {
            transformed.put(entry.getKey().replace(".", "%2e"), entry.getValue());
        }
        return transformed;
    }//TESTED (legacy)

    //(ugh need to store this logstash-domain-specific information here, might need to update it from time to time buy should remain reasonably simple)
    private static Pattern _getLogstashUrlRegex = Pattern.compile(
            "(?:bucket|host|url|uri|path)[\\s\\n\\r]*=>[\\s\\n\\r]*['\"]([^'\"]+)", Pattern.CASE_INSENSITIVE);

    public String getRepresentativeUrl() {
        if (null == this.getProcessingPipeline()) {
            if (null != this.getUrl()) {
                return this.getUrl();
            } else if ((null != this.getRssConfig()) && (null != this.getRssConfig().getExtraUrls())
                    && !this.getRssConfig().getExtraUrls().isEmpty()) {
                return this.getRssConfig().getExtraUrls().get(0).url;
            }
        } else if (!this.getProcessingPipeline().isEmpty()) {
            SourcePipelinePojo px = this.getProcessingPipeline().get(0);
            if (null != px.file) {
                return px.file.getUrl();
            } else if (null != px.database) {
                return px.database.getUrl();
            } else if (null != px.federatedQuery) {
                if ((null != px.federatedQuery.requests) && !px.federatedQuery.requests.isEmpty()) {
                    return px.federatedQuery.requests.iterator().next().endPointUrl;
                } else {
                    if ((null != px.federatedQuery.entityTypes) && !px.federatedQuery.entityTypes.isEmpty()) {
                        return "inf://federated/" + Arrays.toString(px.federatedQuery.entityTypes.toArray())
                                .replaceAll("[\\[\\]]", "");
                    } else if (null != px.federatedQuery.importScript) {
                        return "inf://federated/" + px.federatedQuery.scriptlang + "/"
                                + px.federatedQuery.importScript.hashCode();
                    } else {
                        return "inf://federated/unknown/";
                    }
                }
            } else if (null != px.logstash) {
                String url = null;
                try {
                    Matcher m1 = _getLogstashUrlRegex.matcher(px.logstash.config);
                    if (m1.find()) { // (get the first)
                        url = m1.group(1);
                    }
                } catch (Exception e) {
                } // return null will error out
                return url;
            }
            // ALL THE DISTRIBUTED CASES
            else if (null != px.postProcessing) { // just use the title, gets a bit complex otherwise
                return "inf://docs/proc/" + this.title.replaceAll("\\s+", "_");
            } else if ((null != px.docs_datastoreQuery) || (null != px.docs_documentQuery)) {
                return "inf://proc/doc/" + this.title.replaceAll("\\s+", "_");
            } else if (null != px.custom_file) {
                return "inf://proc/hdfs/" + this.title.replaceAll("\\s+", "_");
            } else if (null != px.custom_datastoreQuery) {
                return "inf://proc/custom/" + this.title.replaceAll("\\s+", "_");
            } else if (null != px.records_indexQuery) {
                return "inf://proc/records/" + this.title.replaceAll("\\s+", "_");
            } else if (null != px.feature_datastoreQuery) {
                return "inf://proc/feature/" + this.title.replaceAll("\\s+", "_");
            }
            //(END DISTRIBUTED CASES)
            else {
                SourceRssConfigPojo webOrFeed = px.feed;
                if (null == webOrFeed) {
                    webOrFeed = px.web;
                }
                if ((null != webOrFeed) && (null != webOrFeed.getExtraUrls())
                        && !webOrFeed.getExtraUrls().isEmpty()) {
                    return webOrFeed.getExtraUrls().get(0).url;
                }
            }
        }
        return null;
    }//TESTED (legacy+basic_web_test_ocOptions)

    public void fillInSourcePipelineFields() {
        // Note the extract type code is "sort of" duplicated in the HarvestControllerPipeline.extractSource_preProcessingPipeline code
        if (null != this.getProcessingPipeline()) {
            this.extractType = null; // always derive from the px pipeline, ignore user input

            for (SourcePipelinePojo px : this.getProcessingPipeline()) {
                if (null != px.file) {
                    this.extractType = "File";
                } else if (null != px.database) {
                    this.extractType = "Database";
                } else if (null != px.logstash) {
                    this.extractType = "Logstash";
                } else if ((null != px.web) || (null != px.feed)) {
                    this.extractType = "Feed";
                } else if (null != px.federatedQuery) {
                    this.extractType = "Federated";
                    this.federatedQueryCommunityIds = this.communityIds;
                } else if (null != px.postProcessing) {
                    this.extractType = "Post_processing";
                } else if ((null != px.docs_datastoreQuery) || (null != px.docs_documentQuery)
                        || (null != px.custom_file) || (null != px.custom_datastoreQuery)
                        || (null != px.records_indexQuery) || (null != px.feature_datastoreQuery)) {
                    this.extractType = "Custom";
                }

                if (null != px.harvest) {
                    if (null != px.harvest.distributionFactor) {
                        distributionFactor = px.harvest.distributionFactor;
                    } //TESTED
                    if (null != px.harvest.searchCycle_secs) {
                        if ((null == searchCycle_secs) || (searchCycle_secs >= 0)) {
                            searchCycle_secs = Math.abs(px.harvest.searchCycle_secs);
                        } else { // (searchCycle_secs < 0 ie want to suspend source)
                            if (0 == px.harvest.searchCycle_secs) { // (0 == run once and then suspend) 
                                searchCycle_secs = -1;
                            } else {
                                searchCycle_secs = -Math.abs(px.harvest.searchCycle_secs);
                            }
                        }
                    } //TESTED
                    else if ((null != searchCycle_secs) && (searchCycle_secs < 0)) {
                        // No search cycle specfiied, source suspended
                        searchCycle_secs = -1;
                    } //TESTED
                    else { // No search cycle specified and source not suspended
                        searchCycle_secs = null;
                    } //TESTED
                    break;
                }
            }
        } //TESTED      
    }

    public Boolean getPartiallyPublished() {
        return partiallyPublished;
    }

    public void setPartiallyPublished(Boolean partiallyPublished) {
        this.partiallyPublished = partiallyPublished;
    }

    public Set<ObjectId> getFederatedQueryCommunityIds() {
        return federatedQueryCommunityIds;
    }

    public void setFederatedQueryCommunityIds(Set<ObjectId> federatedQueryCommunityIds) {
        this.federatedQueryCommunityIds = federatedQueryCommunityIds;
    }

    public BasicDBObject getTemplateProcessingFlow() {
        return templateProcessingFlow;
    }

    public void setTemplateProcessingFlow(BasicDBObject templateProcessingFlow) {
        this.templateProcessingFlow = templateProcessingFlow;
    }

    public Boolean getOwnedByAdmin() {
        return ownedByAdmin;
    }

    public void setOwnedByAdmin(Boolean ownedByAdmin) {
        this.ownedByAdmin = ownedByAdmin;
    }

    public Integer getTimeToLive_days() {
        return timeToLive_days;
    }

    public void setTimeToLive_days(Integer timeToLive_days) {
        this.timeToLive_days = timeToLive_days;
    }

    public Integer getHighestDistributionFactorStored() {
        return highestDistributionFactorStored;
    }

    public void setHighestDistributionFactorStored(Integer highestDistributionFactorStored) {
        _distributedKeys = null;
        _distributedKeyQueryTerm = null;
        this.highestDistributionFactorStored = highestDistributionFactorStored;
    }
}