// Java tutorial
/******************************************************************************* * Copyright 2012, The Infinit.e Open Source Project. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License, version 3, * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. ******************************************************************************/ /** * */ package com.ikanow.infinit.e.harvest.extraction.document; import java.util.Date; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Set; import java.util.TreeSet; import org.apache.log4j.Logger; import org.bson.types.ObjectId; import com.ikanow.infinit.e.data_model.InfiniteEnums.HarvestEnum; import com.ikanow.infinit.e.data_model.store.DbManager; import com.ikanow.infinit.e.data_model.store.MongoDbManager; import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo; import com.ikanow.infinit.e.data_model.store.document.DocumentPojo; import com.mongodb.BasicDBList; import com.mongodb.BasicDBObject; import com.mongodb.DBCollection; import com.mongodb.DBCursor; import com.mongodb.DBObject; import com.mongodb.ReadPreference; /** * @author cmorgan * */ public class DuplicateManager_Integrated implements DuplicateManager { @SuppressWarnings("unused") private static final Logger logger = Logger.getLogger(DuplicateManager_Integrated.class); ///////////////////////////////////////////////////////////////////////////////////////////////////////// // DUPLICATION LOGIC // If a URL already exists... 
// 1] For the same source: the document is a duplicate // 2] For a different source, with the same "configuration hash": Set that source as the "duplicateFrom" field // in the DocumentPojo which in the HarvestController will result in the DocumentPojo's being cloned from that {url,source} pair // 3] For a different source, with a different "configuration hash": Proceed as if URL didn't already exist // Some implementation specifics: // The duplicate manager will store knowledge gained about other sources (eg whether they fall into [2] or [3]) for a given source. private Set<String> _sameConfigurationSources = null; private Set<String> _differentConfigurationSources = null; private Set<String> _sameCommunitySources = null; private boolean _bCalculatedMostRecentlyModifiedFile = false; private Date _mostRecentlyModifiedFile = null; private ObjectId _mostRecentlyModifiedDocId = null; private int _replicaSetDistributionRatio = -1; ///////////////////////////////////////////////////////////////////////////////////////////////////////// // INTERFACE // Resets source-specific state public void resetForNewSource() { _sameConfigurationSources = null; _differentConfigurationSources = null; _sameCommunitySources = null; _bCalculatedMostRecentlyModifiedFile = false; _mostRecentlyModifiedFile = null; _mostRecentlyModifiedDocId = null; com.ikanow.infinit.e.data_model.utils.PropertiesManager dataModelProps = new com.ikanow.infinit.e.data_model.utils.PropertiesManager(); _replicaSetDistributionRatio = 1 + dataModelProps.getDocDbReadDistributionRatio(); } /** * Tests to see if duplicates exist based on defined key * * @param title - currently ignored * @param description - currently ignored * @param duplicateSources - list of sources containing a duplicate URL, filled in transiently by calls to this function * @param key * @return boolean (true/false) */ public boolean isDuplicate_UrlTitleDescription(String url, String title, String description, SourcePojo source, List<String> 
duplicateSources) { // Removing title/desc match for now, we mandate that for a given source key there be a unique URL return isDuplicate_Url(url, source, duplicateSources); // TODO (INF-1890): Actually for RSS, changes in title/desc/pub date should result in updates occurring //BasicDBObject orQuery1 = new BasicDBObject(DocumentPojo.title_, title); //BasicDBObject orQuery2 = new BasicDBObject(DocumentPojo.description_, description); //query.put(MongoDbManager.or_, Arrays.asList(orQuery1, orQuery2)); } /** * Tests to see if duplicates exist based on defined key * * @param duplicateSources - list of sources containing a duplicate URL, filled in transiently by calls to this function * @return boolean (true/false) */ public boolean isDuplicate_Url(String url, SourcePojo source, List<String> duplicateSources) { BasicDBObject query = new BasicDBObject(); query.put(DocumentPojo.url_, url); if (null != duplicateSources) { return duplicationLogic(query, source, duplicateSources); } else { query.put(DocumentPojo.sourceKey_, source.getDistributedKeyQueryTerm()); BasicDBObject fields = new BasicDBObject(DocumentPojo._id_, 1); return null != MongoDbManager.getDocument().getMetadata().findOne(query, fields); } } public ObjectId getLastDuplicateId() { return _duplicateId; } public Date getLastDuplicateModifiedTime() { return _modifiedTimeOfActualDuplicate; } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // File handling specific logic /** * Tests to see if duplicates might exist. * If it is not a duplicate, true is returned. If it is a duplicate, * the modified date is then checked to see if the file has been updated. * True is returned if the file has been updated, false otherwise. 
* * @param collection * @param modifiedDate * @param url * @param title * @return boolean (true/false) */ public boolean needsUpdated_SourceUrl(Date modifiedDate, String sourceUrl, SourcePojo source) { // Performance shortcut: if (!_bCalculatedMostRecentlyModifiedFile) { _bCalculatedMostRecentlyModifiedFile = true; // Get date of most recently modified file: try { if ((null != source.getHarvestStatus()) && (HarvestEnum.success == source.getHarvestStatus().getHarvest_status())) { BasicDBObject mostRecentQuery = new BasicDBObject(DocumentPojo.sourceKey_, source.getDistributedKeyQueryTerm()); BasicDBObject mostRecentSort = new BasicDBObject(DocumentPojo._id_, -1); BasicDBObject mostRecentFields = new BasicDBObject(DocumentPojo.modified_, 1); if (null != source.getDistributionFactor()) { // (need the created date also mostRecentFields.put(DocumentPojo.created_, 1); } DBCursor mostRecentDocs = MongoDbManager.getDocument().getMetadata() .find(mostRecentQuery, mostRecentFields).sort(mostRecentSort).limit(1); if (mostRecentDocs.hasNext()) { BasicDBObject mostRecentDocDbo = (BasicDBObject) mostRecentDocs.next(); _mostRecentlyModifiedFile = (Date) mostRecentDocDbo.get(DocumentPojo.modified_); _mostRecentlyModifiedDocId = (ObjectId) mostRecentDocDbo.get(DocumentPojo._id_); if (null != source.getDistributionFactor()) { // This is a slightly more complex case because other... //...threads for this source could be writing documents asynchronously ... 
so we're just going to disable everything //if the most recent doc is _after_ our last harvest time (since this means we've already started harvesting the new source) Date mostRecentlyModifedFile_createdTime = (Date) mostRecentDocDbo .get(DocumentPojo.created_); if ((null != source.getHarvestStatus()) && (null != source.getHarvestStatus().getHarvested() && (null != mostRecentlyModifedFile_createdTime))) { if (mostRecentlyModifedFile_createdTime .after(source.getHarvestStatus().getHarvested())) { _mostRecentlyModifiedFile = null; _mostRecentlyModifiedDocId = null; } } else { // If we don't have a date then force a "slow" dedup _mostRecentlyModifiedFile = null; _mostRecentlyModifiedDocId = null; } } //TESTED } //(found docs) } //(success mode) } catch (Exception e) { } // If anything goes wrong will just check all files (slower) } //TESTED if (null != _mostRecentlyModifiedFile) { // Use short cut... long nMostRecentlyModifiedTime = _mostRecentlyModifiedFile.getTime() / 1000L; long nFileTime = modifiedDate.getTime() / 1000L; if (nFileTime <= nMostRecentlyModifiedTime) { return false; } } //TESTED else if (null == sourceUrl) { return true; // (for custom checking - if we couldn't get a cached value to compare against then assume we are inspecting) } // No short cut, go the long way round: DBCollection collection = DbManager.getDocument().getMetadata(); boolean ret = true; BasicDBObject query = new BasicDBObject(); query.put(DocumentPojo.sourceUrl_, sourceUrl); query.put(DocumentPojo.sourceKey_, source.getDistributedKeyQueryTerm()); BasicDBObject hint = new BasicDBObject(DocumentPojo.sourceUrl_, 2); BasicDBObject fields = new BasicDBObject(DocumentPojo.modified_, 1); DBCursor dbc = collection.find(query, fields).hint(hint).limit(1); // (this should be very fast since sourceUrl is indexed ... 
order doesn't matter as all docs should have the same modified) //TODO (INF-1922): at some point should look into making (sparse) sourceUrl be compounded with sourceKey - this is a bit risky if (!dbc.hasNext()) { //if there is no record, return true ret = true; modifiedDate.setTime(0); } else { // (all docs should have same modified, though this is ~ time ordered anyway) BasicDBObject dbo = (BasicDBObject) dbc.iterator().next(); Date oldModified = (Date) dbo.get(DocumentPojo.modified_); ret = ((modifiedDate.getTime() / 1000) != (oldModified.getTime() / 1000)); // ie if different -> true -> update docs from sourceUrl // ^^ note granularity seems only to be guaranteed to 1s somewhere in the system (not sure where) // (this is just backwards compatible for a deployment where this has happened for some % -probably 100- of the docs // once an RPM >=5955 is deployed this will no longer be necessary) } return ret; }//TESTED public boolean needsUpdated_Url(Date modifiedDate, String url, SourcePojo source) { // Performance shortcut: if (!_bCalculatedMostRecentlyModifiedFile) { _bCalculatedMostRecentlyModifiedFile = true; // Get date of most recently modified file: try { if ((null != source.getHarvestStatus()) && (HarvestEnum.success == source.getHarvestStatus().getHarvest_status())) { BasicDBObject mostRecentQuery = new BasicDBObject(DocumentPojo.sourceKey_, source.getDistributedKeyQueryTerm()); if (null != source.getDistributionFactor()) { // if distributed, then apply extra term if ((null != source.getHarvestStatus()) && (null != source.getHarvestStatus().getDistributedLastCompletedCycle())) { Date d = source.getHarvestStatus().getDistributedLastCompletedCycle(); mostRecentQuery.put(DocumentPojo._id_, new BasicDBObject(DbManager.lte_, new ObjectId(d))); } } //TESTED BasicDBObject mostRecentSort = new BasicDBObject(DocumentPojo._id_, -1); BasicDBObject mostRecentFields = new BasicDBObject(DocumentPojo.modified_, 1); if (null != source.getDistributionFactor()) { // (need 
the created date also mostRecentFields.put(DocumentPojo.created_, 1); } DBCursor mostRecentDocs = MongoDbManager.getDocument().getMetadata() .find(mostRecentQuery, mostRecentFields).sort(mostRecentSort).limit(1); if (mostRecentDocs.hasNext()) { BasicDBObject mostRecentDocDbo = (BasicDBObject) mostRecentDocs.next(); _mostRecentlyModifiedFile = (Date) mostRecentDocDbo.get(DocumentPojo.modified_); _mostRecentlyModifiedDocId = (ObjectId) mostRecentDocDbo.get(DocumentPojo._id_); } //TESTED (found docs) //DEBUG //if (null != _mostRecentlyModifiedDocId) // System.out.println("DEDUP: " + mostRecentQuery + ": RESULTS IN " + new Date(_mostRecentlyModifiedDocId.getTime())); } //(success mode) } catch (Exception e) { } // If anything goes wrong will just check all files (slower) } //TESTED if (null != _mostRecentlyModifiedFile) { // Use short cut... long nMostRecentlyModifiedTime = _mostRecentlyModifiedFile.getTime() / 1000L; long nFileTime = modifiedDate.getTime() / 1000L; if (nFileTime <= nMostRecentlyModifiedTime) { return false; } } //TESTED if (null == url) { // use this call with url==null to just check the modified file... 
return true; } // No short cut, go the long way round: DBCollection collection = DbManager.getDocument().getMetadata(); boolean ret = true; BasicDBObject query = new BasicDBObject(); query.put(DocumentPojo.url_, url); query.put(DocumentPojo.sourceKey_, source.getDistributedKeyQueryTerm()); BasicDBObject fields = new BasicDBObject(DocumentPojo.modified_, 1); DBCursor dbc = collection.find(query, fields).limit(2); // (will normally return 0 or 1) boolean foundMatch = dbc.hasNext(); if (!foundMatch) { //if there is no record, return true ret = true; } else { BasicDBObject dbo = (BasicDBObject) dbc.next(); Date oldModified = (Date) dbo.get(DocumentPojo.modified_); if ((modifiedDate.getTime() / 1000) != (oldModified.getTime() / 1000)) { // times don't match if (!dbc.hasNext()) { // 1 matching doc, different modified times so update ret = true; } //TESTED else { // Not sure about this case, multiple docs, are any of them the same? (Shouldn't ever occur) // (slightly slow but should be OK because not going to happen very often) int nCount = dbc.count(); query.put(DocumentPojo.modified_, modifiedDate); ret = !(collection.find(query).limit(1).count() == nCount); } //TOTEST (shouldn't ever occur) } else { // Doc has same modified time so don't update ret = false; } //TESTED } return ret; }//TOTEST ///////////////////////////////////////////////////////////////////////////////////////////////////////// // UTILITY // Top level utility function to handle duplicate logic boolean duplicationLogic(BasicDBObject query, SourcePojo source, List<String> duplicateSources) { duplicateSources.clear(); boolean bUpdate = false; if ((null != source.getRssConfig()) && (null != source.getRssConfig().getUpdateCycle_secs())) { //RSS and there's a means of updating bUpdate = true; } else if (null != source.getFileConfig()) { // File and we're processing XML, normally >1 /file (else just waste some CPU cycles anyway) bUpdate = true; } //TESTEDx2 // TODO (INF-1300): (Leave databases alone until 
update functionality is implemented, then check if is enabled) LinkedList<String> possibleDups = getCandidateDuplicates(query, source.getKey(), bUpdate); if (!possibleDups.isEmpty()) { String definiteDup = isFunctionalDuplicate(source, possibleDups); if (null == definiteDup) { return false; } //TESTED else if (definiteDup.equalsIgnoreCase(source.getKey())) { return true; } //TESTED else { duplicateSources.add(definiteDup); return false; } //TESTED } else { return false; // Definitely not a duplicate } //TESTED }//TESTED (created different types of duplicate, put print statements in, tested by hand) // Utility function to take DB query and return key information from matching documents private Date _modifiedTimeOfActualDuplicate = null; // (if we have a pure 1-1 duplicate, store its modified time) private ObjectId _duplicateId = null; // (if we have a pure 1-1 duplicate, store its _id) private LinkedList<String> getCandidateDuplicates(BasicDBObject query, String parentSourceKey, boolean bUpdate) { _modifiedTimeOfActualDuplicate = null; _duplicateId = null; LinkedList<String> returnVal = new LinkedList<String>(); DBCollection collection = DbManager.getDocument().getMetadata(); BasicDBObject fields = new BasicDBObject(DocumentPojo.sourceKey_, 1); if (bUpdate) { fields.put(DocumentPojo.modified_, 1); fields.put(DocumentPojo.updateId_, 1); } //TESTED boolean bPrimary = true; if (_replicaSetDistributionRatio > 0) { // (distribute based on source key, should ensure some reasonable cache grouping...) 
if (0 != (parentSourceKey.hashCode() % _replicaSetDistributionRatio)) { bPrimary = false; } } DBCursor dbc = null; if (bPrimary) { dbc = collection.find(query, fields); } else { dbc = collection.find(query, fields).setReadPreference(ReadPreference.secondaryPreferred()); } while (dbc.hasNext()) { DBObject dbo = dbc.next(); String sourceKey = DocumentPojo.getSourceKey((String) dbo.get(DocumentPojo.sourceKey_)); if (null != sourceKey) { // Check for exact duplicates, in which case can bypass horrible functional duplicate logic: boolean bFoundExactDuplicate = sourceKey.equals(parentSourceKey); // Update logic: if (bUpdate && bFoundExactDuplicate) { _modifiedTimeOfActualDuplicate = (Date) dbo.get(DocumentPojo.modified_); _duplicateId = (ObjectId) dbo.get(DocumentPojo.updateId_); if (null == _duplicateId) { // first time, use the _id _duplicateId = (ObjectId) dbo.get(DocumentPojo._id_); } } //TESTED if (bFoundExactDuplicate) { // Found exact duplicate, so return just that for performance returnVal.clear(); } returnVal.add(sourceKey); if (bFoundExactDuplicate) { // Found exact duplicate, we're done here return returnVal; } } //(if doc has source key, else is malformed, ignore) } //(end loop over URL-duplicates) return returnVal; }//TESTED (created different types of duplicate, put print statements in, tested by hand) // Utility function to return one source containing a document for which this is a duplicate // Returns null if there isn't one // Updates _sameConfigurationSources, _differentConfigurationSources, _sameCommunitySources private String isFunctionalDuplicate(SourcePojo source, LinkedList<String> candidateSourceKeys) { // (Ensure everything's set up) if (null == _sameConfigurationSources) { _sameConfigurationSources = new TreeSet<String>(); _differentConfigurationSources = new TreeSet<String>(); _sameCommunitySources = new TreeSet<String>(); } if (null == source.getShah256Hash()) { source.generateShah256Hash(); } // See if we've cached something: String 
returnVal = null; Iterator<String> it = candidateSourceKeys.iterator(); while (it.hasNext()) { String sourceKey = it.next(); if (!source.getDuplicateExistingUrls()) { // Check _sameCommunitySources: ignore+carry on if sourceKey isn't in here, else // return sourceKey, which will treat as a non-update duplicate (non update because // the update params only set if it was an update duplicate) if (_sameCommunitySources.contains(sourceKey)) { return source.getKey(); // (ie return fake source key that will cause above logic to occur) } } //TESTED if (sourceKey.equalsIgnoreCase(source.getKey())) { return sourceKey; // (the calling function will then treat it as a duplicate) } else if (_sameConfigurationSources.contains(sourceKey)) { returnVal = sourceKey; // (overwrite prev value, doesn't matter since this property is obv transitive) } else if (_differentConfigurationSources.contains(sourceKey)) { it.remove(); // (don't need to check this source out) } } //TESTED boolean bMatchedInCommunity = false; // (duplication logic below) if ((null == returnVal) && !candidateSourceKeys.isEmpty()) { // Need to query the DB for this source... 
BasicDBObject query = new BasicDBObject(SourcePojo.shah256Hash_, source.getShah256Hash()); query.put(SourcePojo.key_, new BasicDBObject(MongoDbManager.in_, candidateSourceKeys.toArray())); BasicDBObject fields = new BasicDBObject(SourcePojo._id_, 0); fields.put(SourcePojo.key_, 1); if (!source.getDuplicateExistingUrls()) { fields.put(SourcePojo.communityIds_, 1); } DBCursor dbc = DbManager.getIngest().getSource().find(query, fields); while (dbc.hasNext()) { BasicDBObject dbo = (BasicDBObject) dbc.next(); String sSourceKey = dbo.getString(SourcePojo.key_); // DON'T DEDUP LOGIC: if (!source.getDuplicateExistingUrls()) { BasicDBList communities = (BasicDBList) dbo.get(SourcePojo.communityIds_); for (Object communityIdObj : communities) { ObjectId communityId = (ObjectId) communityIdObj; if (source.getCommunityIds().contains(communityId)) { // Not allowed to duplicate off this _sameCommunitySources.add(sSourceKey); bMatchedInCommunity = true; } } } //(end "don't duplicate existing URLs logic") //TESTED (same community and different communities) if (null != sSourceKey) { _sameConfigurationSources.add(sSourceKey); returnVal = sSourceKey; // (overwrite prev value, doesn't matter since this property is obv transitive) } } // Loop over config sources again to work out which keys can now be placed in the "_differentConfigurationSources" cache for (String sourceKey : candidateSourceKeys) { if (!_sameConfigurationSources.contains(sourceKey)) { _differentConfigurationSources.add(sourceKey); } } } //TESTED if (bMatchedInCommunity) { return source.getKey(); // (ie return fake source key that will cause above logic to occur) } else { return returnVal; } }//TESTED (created different types of duplicate, put print statements in, tested by hand) @Override public Date getLastModifiedDate() { return _mostRecentlyModifiedFile; } @Override public ObjectId getLastModifiedDocId() { return _mostRecentlyModifiedDocId; } }