com.ikanow.infinit.e.data_model.store.document.DocumentPojo.java Source code

Java tutorial

Introduction

Here is the source code for com.ikanow.infinit.e.data_model.store.document.DocumentPojo.java

Source

/*******************************************************************************
 * Copyright 2012 The Infinit.e Open Source Project
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
/**
 * 
 */
package com.ikanow.infinit.e.data_model.store.document;

import java.lang.reflect.Type;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.commons.lang.ArrayUtils;
import org.bson.types.ObjectId;

import com.google.gson.GsonBuilder;
import com.google.gson.JsonArray;
import com.google.gson.JsonDeserializationContext;
import com.google.gson.JsonDeserializer;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParseException;
import com.google.gson.JsonSerializationContext;
import com.google.gson.JsonSerializer;
import com.google.gson.reflect.TypeToken;
import com.ikanow.infinit.e.data_model.store.BaseDbPojo;
import com.ikanow.infinit.e.data_model.store.MongoDbUtil;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePipelinePojo;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.mongodb.BasicDBList;

/**
 * The generic document data model.
 * <p>
 * This class is the storage (MongoDB) representation of a document. The API
 * data model is the same except where noted on individual fields
 * (DocumentApiPojoMap converts); for the index data model see
 * DocumentIndexPojoMap.
 * <p>
 * Field names are part of the serialized format, so they must not be renamed.
 *
 * @author apiggott
 */
public class DocumentPojo extends BaseDbPojo {

    /**
     * Standard static function for readability.
     *
     * @return a type token describing {@code List<DocumentPojo>}
     */
    public static TypeToken<List<DocumentPojo>> listType() {
        return new TypeToken<List<DocumentPojo>>() {
        };
    }

    //*** IMPORTANT: don't add to this list without considering the ES mapping in DocumentIndexPojoMap

    // Storage (Mongo) data model
    // API data model is the same except where otherwise specified (DocumentApiPojoMap converts)
    // For index data model see DocumentIndexPojoMap

    ////////////////////////////////////////////////////////////////////////////////

    // Stored Fields:

    // Basic metadata

    // (API-side, this is an immutable id for the doc; DB-side this is the DB _id and changes with every update)
    private ObjectId _id = null;
    public static final String _id_ = "_id";
    // (API-side, this is the current DB id; DB-side this is the original _id, or null if this doc has never been updated)
    private ObjectId updateId = null;
    public static final String updateId_ = "updateId";
    private String title = null;
    public static final String title_ = "title";
    private String url = null;
    public static final String url_ = "url";
    private Date created = null;
    public static final String created_ = "created";
    private Date modified = null;
    public static final String modified_ = "modified";
    private Date publishedDate = null;
    public static final String publishedDate_ = "publishedDate";

    // Data source
    private String source = null; // (API side is Set<String>)
    public static final String source_ = "source";
    private String sourceKey = null; // (API side is Set<String>) Internally may include #N or #NN to help with distribution
    public static final String sourceKey_ = "sourceKey";
    private String mediaType = null; // (API side is Set<String>)
    public static final String mediaType_ = "mediaType";
    transient String sourceType = null; // feed, db, or filesys
    public static final String sourceType_ = "sourceType";

    // Content
    private String description = null;
    public static final String description_ = "description";
    // Enriched content
    private List<EntityPojo> entities = null;
    public static final String entities_ = "entities";
    // (moved metadata to beta because of wholesale changes)

    // Data source/Content
    private Set<String> tags = null;
    public static final String tags_ = "tags";
    private String displayUrl = null;
    public static final String displayUrl_ = "displayUrl";

    // Data source
    private ObjectId communityId = null;
    public static final String communityId_ = "communityId";
    // (note: as far as the API is concerned this is a Set<String>)

    // Currently only used for xml files
    private String sourceUrl = null;
    public static final String sourceUrl_ = "sourceUrl";

    // Enriched content
    private List<AssociationPojo> associations = null;
    public static final String associations_ = "associations";
    private LinkedHashMap<String, Object[]> metadata = null; // has to be [] to allow for 1+
    public static final String metadata_ = "metadata";
    private GeoPojo docGeo = null; // holds the location of the document, if it has one separate to its entities and events
    public static final String docGeo_ = "docGeo";

    // Mongo/Elasticsearch-specific field
    private String index = null; // The name of the index to which the feed's been added
    public static final String index_ = "index";

    // Only used for query responses
    private Object explain = null;
    public static final String explain_ = "explain";

    /////////////////////////////////////////////////////////////////////////////////////////////////

    // The following won't be stored in the DB (either created by index map or transient)

    // Alpha unstored (eg index or API fields)

    // Content
    private String fullText = null;
    public static final String fullText_ = "fullText";

    // Per query (transient, created on the way to the API for query, not currently stored anywhere)

    private Double aggregateSignif; // The document significance normalized against Lucene relevance
    public static final String aggregateSignif_ = "aggregateSignif";
    private Double queryRelevance; // The Lucene relevance normalized against Infinit.e significance
    public static final String queryRelevance_ = "queryRelevance";
    private Double score; // The combined scores (vs the query weighting)
    public static final String score_ = "score";

    // Alpha transient:

    private transient String tmpFullText = null; // (temporary storage until obj written to MongoDB)
    private transient String rawFullText = null; // (stores a pointer to the first full text set, ie normally directly from URL/file)

    // Beta unstored (eg index or API fields)

    // Index-specific fields (ElasticSearch):
    private Set<String> locs = null;
    public static final String locs_ = "locs";

    @SuppressWarnings("unused")
    private List<GeoPojo> timeRanges = null; // (won't be used for beta - allow encapsulation of time ranges as 2d points)
    public static final String timeRanges_ = "timeRanges";
    private Set<Integer> months = null; // (dates represented as YYYYMM - used to generate histograms, nothing else)
    public static final String months_ = "months";

    // Beta transient:

    private transient SourcePojo _source = null; // (handy accessor for the "parent" source info)

    // Header & footer data (doesn't persist in the DB - used for extraction and enrichment)
    private transient int headerEndIndex = 0; // (obv starts at 0)
    private transient int footerStartIndex = Integer.MAX_VALUE; // (obv ends at the end of the document)
    private transient Set<String> headerFields = null;
    private transient Set<String> footerFields = null;
    private transient String headerText = null; // (\n-separated list of headerFields)
    private transient String footerText = null; // (\n-separated list of footerFields)

    // V0 transient

    // Multi-community/source handling
    private transient String duplicateFrom = null; // Indicates this document should be cloned from the DB entry with matching URL, "duplicateFrom" source
    private transient DocumentPojo cloneFrom = null; // Indicates this document should be cloned from the "cloneFrom" in-memory copy after enrichment
    private transient SourcePipelinePojo spawnedFrom = null; // Indicates this document was spawned from a "document splitter" (so should ignore previous pipeline elements)
    private transient boolean hasDefaultUrl = false; // (for files only) if true then can skip an extra dedup step

    ////////////////////////////////////////////////////////////////////////////////

    // Alpha gets and sets

    /** Creates an empty document; every stored field starts out as {@code null}. */
    public DocumentPojo() {
    }

    /** @return the document id */
    public ObjectId getId() {
        return _id;
    }

    /** @param _id the document id to set */
    public void setId(ObjectId _id) {
        this._id = _id;
    }

    /** @return the title */
    public String getTitle() {
        return title;
    }

    /** @param title the title to set */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the description */
    public String getDescription() {
        return description;
    }

    /** @param description the description to set */
    public void setDescription(String description) {
        this.description = description;
    }

    /** @return the url */
    public String getUrl() {
        return url;
    }

    /** @param url the url to set */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the created date */
    public Date getCreated() {
        return this.created;
    }

    /** @param created the created date to set */
    public void setCreated(Date created) {
        this.created = created;
    }

    /** @return the modified date */
    public Date getModified() {
        return this.modified;
    }

    /** @param modified the modified date to set */
    public void setModified(Date modified) {
        this.modified = modified;
    }

    /** @return the published date */
    public Date getPublishedDate() {
        return this.publishedDate;
    }

    /** @param publishedDate the published date to set */
    public void setPublishedDate(Date publishedDate) {
        this.publishedDate = publishedDate;
    }

    /** @return the source */
    public String getSource() {
        return source;
    }

    /** @param source the source to set */
    public void setSource(String source) {
        this.source = source;
    }

    /**
     * @return the source key exactly as stored, including any trailing
     *         {@code #N} / {@code #NN} distribution suffix
     */
    public String getRawSourceKey() {
        return sourceKey;
    }

    /**
     * Strips the trailing {@code #N} or {@code #NN} distribution suffix from a
     * raw source key.
     * <p>
     * The suffix is recognised purely by position: a {@code '#'} as the second
     * or third character from the end. Keys too short to carry such a suffix
     * are returned unchanged (previously they caused a
     * {@link StringIndexOutOfBoundsException}).
     *
     * @param rawSourceKey the stored source key, may be {@code null}
     * @return the key without its suffix, the key itself if it has no suffix,
     *         or {@code null} if {@code rawSourceKey} is {@code null}
     */
    public static String getSourceKey(String rawSourceKey) {
        if (null == rawSourceKey) {
            return null;
        }
        int len = rawSourceKey.length();
        // "#N": the '#' is the second-to-last character
        if ((len >= 2) && ('#' == rawSourceKey.charAt(len - 2))) {
            return rawSourceKey.substring(0, len - 2);
        }
        // "#NN": the '#' is the third-to-last character
        if ((len >= 3) && ('#' == rawSourceKey.charAt(len - 3))) {
            return rawSourceKey.substring(0, len - 3);
        }
        return rawSourceKey;
    }

    /**
     * @return this document's source key with any {@code #N} / {@code #NN}
     *         suffix removed (see {@link #getSourceKey(String)})
     */
    public String getSourceKey() {
        return getSourceKey(sourceKey);
    }

    /** @param sourceKey the (raw) source key to set */
    public void setSourceKey(String sourceKey) {
        this.sourceKey = sourceKey;
    }

    /** @param entities the entities to set */
    public void setEntities(List<EntityPojo> entities) {
        this.entities = entities;
    }

    /** @return the entities */
    public List<EntityPojo> getEntities() {
        return entities;
    }

    /** @return the media type */
    public String getMediaType() {
        return mediaType;
    }

    /** @param mediaType the media type to set */
    public void setMediaType(String mediaType) {
        this.mediaType = mediaType;
    }

    /**
     * @return the non-transient full text if it has been set, otherwise the
     *         temporary (transient) full text
     */
    public String getFullText() {
        return (null == fullText) ? tmpFullText : fullText;
    }

    /**
     * Sets the temporary (transient) full text. The first value seen while no
     * raw full text is held is also remembered as the raw full text. The
     * non-transient field is not touched; see {@link #makeFullTextNonTransient()}.
     *
     * @param fullText the full text to set
     */
    public void setFullText(String fullText) {
        if (null == this.rawFullText) { // very first time, set the raw full text
            this.rawFullText = fullText;
        }
        this.tmpFullText = fullText;
    }

    /** Copies the temporary full text into the non-transient full text field. */
    public void makeFullTextNonTransient() {
        this.fullText = this.tmpFullText;
    }

    /**
     * This is used for convenience, also used as a hacky flag to spot update
     * documents that have been discarded from the update list.
     *
     * @return the transient "parent" source
     */
    public SourcePojo getTempSource() {
        return _source;
    }

    /** @param tempSource the transient "parent" source to set */
    public void setTempSource(SourcePojo tempSource) {
        _source = tempSource;
    }

    ////////////////////////////////////////////////////////////////////////////////

    // Alpha utility

    ////////////////////////////////////////////////////////////////////////////////

    // Beta gets and sets

    /** @param events the associations to set */
    public void setAssociations(List<AssociationPojo> events) {
        this.associations = events;
    }

    /** @return the associations */
    public List<AssociationPojo> getAssociations() {
        return this.associations;
    }

    /**
     * Appends a single value to a metadata field, creating the metadata map
     * and/or the field as needed.
     * <p>
     * The value itself is appended to an existing field; previously the
     * one-element wrapper array was appended, producing a nested array.
     *
     * @param fieldName the metadata field name
     * @param fieldVal  the value to append
     */
    public void addToMetadata(String fieldName, Object fieldVal) {
        if (null == metadata) {
            metadata = new LinkedHashMap<String, Object[]>();
        }
        Object[] current = metadata.get(fieldName);
        if (null != current) {
            metadata.put(fieldName, ArrayUtils.add(current, fieldVal));
        } else {
            metadata.put(fieldName, new Object[] { fieldVal });
        }
    }

    /**
     * Appends several values to a metadata field, creating the metadata map
     * and/or the field as needed. When the field does not exist yet the given
     * array is stored as-is (not copied).
     *
     * @param fieldName the metadata field name
     * @param fieldVals the values to append
     */
    public void addToMetadata(String fieldName, Object[] fieldVals) {
        if (null == metadata) {
            metadata = new LinkedHashMap<String, Object[]>();
        }
        Object[] current = metadata.get(fieldName);
        if (null != current) {
            metadata.put(fieldName, ArrayUtils.addAll(current, fieldVals));
        } else {
            metadata.put(fieldName, fieldVals);
        }
    }

    /** @param metadata the metadata map to set */
    public void setMetadata(LinkedHashMap<String, Object[]> metadata) {
        this.metadata = metadata;
    }

    /** @return the metadata map, or {@code null} if none has been set */
    public LinkedHashMap<String, Object[]> getMetadata() {
        return this.metadata;
    }

    /**
     * Same as {@link #getMetadata()} (alternative capitalisation).
     *
     * @return the metadata map, or {@code null} if none has been set
     */
    public LinkedHashMap<String, Object[]> getMetaData() {
        return metadata;
    }

    /** @return the tags, or {@code null} if none have been set */
    public Set<String> getTags() {
        return tags;
    }

    /** @param tags_ the tags to set */
    public void setTags(Set<String> tags_) {
        tags = tags_;
    }

    /**
     * Adds tags to this document. The tag set is created on demand, so this
     * is safe to call before {@link #setTags(Set)}; a {@code null} argument is
     * ignored.
     *
     * @param tags_ the tags to add, may be {@code null}
     */
    public void addTags(Set<String> tags_) {
        if (null == tags_) {
            return;
        }
        if (null == tags) {
            tags = new HashSet<String>();
        }
        tags.addAll(tags_);
    }

    /** @param communityId the community id to set */
    public void setCommunityId(ObjectId communityId) {
        this.communityId = communityId;
    }

    /** @return the community id */
    public ObjectId getCommunityId() {
        return this.communityId;
    }

    /** @return the document-level geo location */
    public GeoPojo getDocGeo() {
        return docGeo;
    }

    /**
     * Stores the result of passing the given geotag through
     * {@code GeoPojo.cleanseBadGeotag}.
     *
     * @param docGeo the document-level geo location to set
     */
    public void setDocGeo(GeoPojo docGeo) {
        this.docGeo = GeoPojo.cleanseBadGeotag(docGeo);
    }

    /**
     * @param locs the locs to set
     */
    public void setLocs(Set<String> locs) {
        this.locs = locs;
    }

    /**
     * @return the locs
     */
    public Set<String> getLocs() {
        return locs;
    }

    /**
     * @param months the months to set
     */
    public void setMonths(Set<Integer> months) {
        this.months = months;
    }

    /**
     * @return the months
     */
    public Set<Integer> getMonths() {
        return months;
    }

    /**
     * @param sourceUrl the sourceUrl to set
     */
    public void setSourceUrl(String sourceUrl) {
        this.sourceUrl = sourceUrl;
    }

    /**
     * @return the sourceUrl
     */
    public String getSourceUrl() {
        return sourceUrl;
    }

    /**
     * @return the index
     */
    public String getIndex() {
        return index;
    }

    /**
     * @param index the index to set
     */
    public void setIndex(String index) {
        this.index = index;
    }

    ////////////////////////////////////////////////////////////////////////////////

    // Beta utility

    /**
     * Renders the metadata values as separate lines so that extraction can be
     * performed on them.
     * <p>
     * Every non-null value of every metadata field is written on its own line
     * (terminated by {@code '\n'}), in field insertion order. Previously the
     * value arrays themselves were appended, which produced array identity
     * strings rather than the values, and a missing metadata map caused a
     * {@link NullPointerException}.
     *
     * @return the metadata values, one per line; empty if there is no metadata
     */
    public String metaDataToText() {
        StringBuilder sb = new StringBuilder();
        if (null != metadata) {
            for (Object[] fieldVals : metadata.values()) {
                if (null == fieldVals) {
                    continue;
                }
                for (Object fieldVal : fieldVals) {
                    if (null != fieldVal) {
                        sb.append(fieldVal).append('\n');
                    }
                }
            }
        }
        return sb.toString();
    }//TOTEST - to be done during DB integration

    ////////////////////////////////////////////////////////////////////////////////

    //(Still beta) Header Footer Stuff ... can be used by entity extractors

    /**
     * @return the headerEndIndex
     */
    @Deprecated
    public int getHeaderEndIndex() {
        return headerEndIndex;
    }

    /**
     * @param headerEndIndex the headerEndIndex to set
     */
    @Deprecated
    public void setHeaderEndIndex(int headerEndIndex) {
        this.headerEndIndex = headerEndIndex;
    }

    /**
     * @return the footerStartIndex
     */
    @Deprecated
    public int getFooterStartIndex() {
        return footerStartIndex;
    }

    /**
     * @param footerStartIndex the footerStartIndex to set
     */
    @Deprecated
    public void setFooterStartIndex(int footerStartIndex) {
        this.footerStartIndex = footerStartIndex;
    }

    /**
     * Records a header field, lower-cased.
     *
     * @param sHeaderField the header field to record (must not be {@code null})
     */
    @Deprecated
    public void addToHeader(String sHeaderField) {
        if (null == headerFields) {
            headerFields = new HashSet<String>();
        }
        headerFields.add(sHeaderField.toLowerCase());
    }

    /**
     * Records a footer field, lower-cased.
     *
     * @param sFooterField the footer field to record (must not be {@code null})
     */
    @Deprecated
    public void addToFooter(String sFooterField) {
        if (null == footerFields) {
            footerFields = new HashSet<String>();
        }
        footerFields.add(sFooterField.toLowerCase());
    }

    /** @return the recorded header fields, or {@code null} if none were added */
    @Deprecated
    public Set<String> getHeaderFields() {
        return headerFields;
    }

    /** @return the recorded footer fields, or {@code null} if none were added */
    @Deprecated
    public Set<String> getFooterFields() {
        return footerFields;
    }

    /**
     * @return the empty string if no header fields have been recorded,
     *         otherwise the stored header text
     */
    @Deprecated
    public String getHeader() {
        if (null == headerFields) {
            return "";
        }
        // NOTE(review): nothing in this class ever assigns headerText, so this
        // returns null once header fields exist - confirm whether it should be built here.
        return headerText;
    }

    /**
     * @return the empty string if no footer fields have been recorded,
     *         otherwise the stored footer text
     */
    @Deprecated
    public String getFooter() {
        if (null == footerFields) {
            return "";
        }
        // NOTE(review): nothing in this class ever assigns footerText, so this
        // returns null once footer fields exist - confirm whether it should be built here.
        return footerText;
    }

    /**
     * Returns the full text with the header and footer regions cut off, using
     * the header end / footer start indexes.
     *
     * @return {@code null} if there is no full text; the whole text if both
     *         indexes still have their defaults; otherwise the text between
     *         the header end and the footer start (or the end of the text if
     *         the footer start lies beyond it)
     */
    @Deprecated
    public String getBody() {
        String text = getFullText();
        if (null == text) {
            return null;
        }
        if ((Integer.MAX_VALUE == footerStartIndex) && (0 == headerEndIndex)) {
            return text;
        }
        // NOTE(review): indexes are not range-checked, so a header end beyond the
        // text (or beyond the footer start) still throws StringIndexOutOfBoundsException.
        if (footerStartIndex > text.length()) {
            return text.substring(headerEndIndex);
        }
        return text.substring(headerEndIndex, footerStartIndex);
    }

    ////////////////////////////////////////////////////////////////////////////////

    // V0 gets and sets

    /** @param sourceKey the source key whose DB entry (with matching URL) this document should be cloned from */
    public void setDuplicateFrom(String sourceKey) {
        duplicateFrom = sourceKey;
    }

    /** @return the source key this document should be duplicated from, if any */
    public String getDuplicateFrom() {
        return duplicateFrom;
    }

    /** @param masterClone the in-memory document this one should be cloned from after enrichment */
    public void setCloneFrom(DocumentPojo masterClone) {
        cloneFrom = masterClone;
    }

    /** @return the in-memory document this one should be cloned from, if any */
    public DocumentPojo getCloneFrom() {
        return cloneFrom;
    }

    ////////////////////////////////////////////////////////////////////////////////
    ////////////////////////////////////////////////////////////////////////////////

    // Base overrides:

    /**
     * Registers this class's custom (de)serializers on the given builder.
     *
     * @param gp the builder to extend
     * @return the builder returned by the registration calls
     */
    public GsonBuilder extendBuilder(GsonBuilder gp) {
        return gp.registerTypeAdapter(DocumentPojo.class, new DocumentPojoDeserializer())
                .registerTypeAdapter(DocumentPojo.class, new DocumentPojoSerializer());
    }

    /**
     * Serializes a document with the default builder, then rewrites metadata
     * object keys so that they contain neither {@code '.'} nor {@code '%'}.
     */
    protected static class DocumentPojoSerializer implements JsonSerializer<DocumentPojo> {
        @Override
        public JsonElement serialize(DocumentPojo doc, Type typeOfT, JsonSerializationContext context) {
            // GSON transformation:
            JsonElement je = DocumentPojo.getDefaultBuilder().create().toJsonTree(doc, typeOfT);

            // Convert object names in metadata
            if ((null != doc.getMetadata()) && !doc.getMetadata().isEmpty()) {
                if (je.isJsonObject()) {
                    JsonElement metadata = je.getAsJsonObject().get("metadata");
                    if (null != metadata) {
                        enforceTypeNamingPolicy(metadata, 0);
                    }
                }
            }
            return je;
        }
        //////////////////////////////////////////////////////////////////////////////////////////

        // Utility function for encoding "."s and "%"s (also duplicate in index)

        /**
         * Recursively encodes object keys in place: {@code '%'} becomes
         * {@code "%25"} and {@code '.'} becomes {@code "%2e"}. Entries whose
         * value is a Java {@code null} are dropped.
         * <p>
         * Every element of an array is processed (previously only the first
         * element was, leaving keys in later elements unencoded).
         *
         * @param je     the element to process
         * @param nDepth the current nesting depth (informational only)
         * @return {@code true} if {@code je} is an object, or an array whose
         *         first element (recursively) is one; {@code false} otherwise
         */
        private static boolean enforceTypeNamingPolicy(JsonElement je, int nDepth) {

            if (je.isJsonPrimitive()) {
                return false; // Done
            } else if (je.isJsonArray()) {
                JsonArray ja = je.getAsJsonArray();
                boolean bFirstIsObject = false; // (stays false for an empty array)
                for (int i = 0; i < ja.size(); ++i) {
                    boolean bIsObject = enforceTypeNamingPolicy(ja.get(i), nDepth + 1);
                    if (0 == i) {
                        bFirstIsObject = bIsObject; // return value keeps its original, first-element meaning
                    }
                }
                return bFirstIsObject;
            } else if (je.isJsonObject()) {
                JsonObject jo = je.getAsJsonObject();
                // Nested variables:
                Iterator<Entry<String, JsonElement>> it = jo.entrySet().iterator();
                // Renamed entries are collected and re-added after the loop, so the
                // object is never modified (other than via the iterator) while iterating
                Map<String, JsonElement> toFixList = null;
                while (it.hasNext()) {
                    boolean bFix = false;
                    Entry<String, JsonElement> el = it.next();
                    String currKey = el.getKey();

                    if ((currKey.indexOf('.') >= 0) || (currKey.indexOf('%') >= 0)) {
                        it.remove();
                        // '%' must be encoded first so the '%' introduced for '.' is not re-encoded
                        currKey = currKey.replace("%", "%25").replace(".", "%2e");
                        bFix = true;
                    }
                    if (null == el.getValue()) {
                        if (!bFix) {
                            it.remove(); // nice easy case, just get rid of it (if bFix, it's already removed)
                        }
                        bFix = false;
                    } else {
                        enforceTypeNamingPolicy(el.getValue(), nDepth + 1);
                    }
                    if (bFix) {
                        if (null == toFixList) {
                            toFixList = new HashMap<String, JsonElement>();
                        }
                        toFixList.put(currKey, el.getValue());
                    }
                } // (end loop over params)
                if (null != toFixList) {
                    for (Entry<String, JsonElement> el : toFixList.entrySet()) {
                        jo.add(el.getKey(), el.getValue());
                    }
                }
                return true; // (in any case, I get renamed by calling parent)
            }
            return false;
        }
        //TESTED (see DOC_META in test/TestCode)
    }

    /**
     * Deserializes a document with the default builder, handling the
     * "metadata" object separately via {@code MongoDbUtil}.
     */
    protected static class DocumentPojoDeserializer implements JsonDeserializer<DocumentPojo> {
        /**
         * Note that the "metadata" member is removed from the supplied JSON
         * object as a side effect.
         */
        @Override
        public DocumentPojo deserialize(JsonElement json, Type typeOfT, JsonDeserializationContext context)
                throws JsonParseException {
            JsonObject jsonDoc = json.getAsJsonObject();
            // Pull the metadata out so the default builder does not try to map it
            JsonObject metadata = jsonDoc.getAsJsonObject("metadata");
            if (null != metadata) {
                jsonDoc.remove("metadata");
            }
            DocumentPojo doc = BaseDbPojo.getDefaultBuilder().create().fromJson(json, DocumentPojo.class);
            if (null != metadata) {
                for (Entry<String, JsonElement> entry : metadata.entrySet()) {
                    if (entry.getValue().isJsonArray()) {
                        doc.addToMetadata(entry.getKey(),
                                MongoDbUtil.encodeArray(entry.getValue().getAsJsonArray()).toArray());
                    } else {
                        // NOTE(review): this resolves to addToMetadata(String, Object), so the
                        // list itself becomes the single array element - confirm this is intended.
                        BasicDBList dbl = new BasicDBList();
                        dbl.add(MongoDbUtil.encodeUnknown(entry.getValue()));
                        doc.addToMetadata(entry.getKey(), dbl);
                    }
                } //TESTED
            }
            return doc;
        }
    }
    ////////////////////////////////////////////////////////////////////////////////

    // Per query (transient, created on the way to the API for query, not currently stored anywhere)

    /** @return the document significance normalized against Lucene relevance */
    public Double getAggregateSignif() {
        return aggregateSignif;
    }

    /** @param aggregateSignif the aggregate significance to set */
    public void setAggregateSignif(Double aggregateSignif) {
        this.aggregateSignif = aggregateSignif;
    }

    /** @return the Lucene relevance normalized against Infinit.e significance */
    public Double getQueryRelevance() {
        return queryRelevance;
    }

    /** @param queryRelevance the query relevance to set */
    public void setQueryRelevance(Double queryRelevance) {
        this.queryRelevance = queryRelevance;
    }

    /** @return the combined score (vs the query weighting) */
    public Double getScore() {
        return score;
    }

    /** @param score the combined score to set */
    public void setScore(Double score) {
        this.score = score;
    }

    /** @param updateId the update id to set */
    public void setUpdateId(ObjectId updateId) {
        this.updateId = updateId;
    }

    /** @return the update id */
    public ObjectId getUpdateId() {
        return updateId;
    }

    /** @param displayUrl the display url to set */
    public void setDisplayUrl(String displayUrl) {
        this.displayUrl = displayUrl;
    }

    /** @return the display url */
    public String getDisplayUrl() {
        return displayUrl;
    }

    /** @param explain the explain object to set */
    public void setExplain(Object explain) {
        this.explain = explain;
    }

    /**
     * (In the harvest context this is used to tell us that the doc is actually
     * being deleted, not just rejected, if != null)
     *
     * @return the explain object
     */
    public Object getExplain() {
        return explain;
    }

    /** Clears the raw full text, so the next {@link #setFullText(String)} call records a new one. */
    public void resetRawFullText() {
        this.rawFullText = null;
    }

    /** @return the raw full text (the first full text set since the last reset) */
    public String getRawFullText() {
        return rawFullText;
    }

    /** @return the pipeline element this document was spawned from, if any */
    public SourcePipelinePojo getSpawnedFrom() {
        return spawnedFrom;
    }

    /** @param spawnedFrom the pipeline element this document was spawned from */
    public void setSpawnedFrom(SourcePipelinePojo spawnedFrom) {
        this.spawnedFrom = spawnedFrom;
    }

    /** @return whether this document has a default url */
    public boolean getHasDefaultUrl() {
        return hasDefaultUrl;
    }

    /** @param hasDefaultUrl whether this document has a default url */
    public void setHasDefaultUrl(boolean hasDefaultUrl) {
        this.hasDefaultUrl = hasDefaultUrl;
    }

}