Java tutorial: com.ikanow.infinit.e.harvest.HarvestController, the Infinit.e harvest engine class used to process all incoming sources (document extraction plus enrichment). The full source is reproduced below; notes and small example sketches follow it.
/*******************************************************************************
 * Copyright 2012, The Infinit.e Open Source Project.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License, version 3,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
/**
 *
 */
package com.ikanow.infinit.e.harvest;

import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLClassLoader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.GZIPInputStream;

import org.apache.log4j.Logger;
import org.bson.types.ObjectId;

import com.ikanow.infinit.e.data_model.Globals;
import com.ikanow.infinit.e.data_model.InfiniteEnums;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorDocumentLevelException;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorSourceLevelException;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorDailyLimitExceededException;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorSourceLevelMajorException;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorSourceLevelTransientException;
import com.ikanow.infinit.e.data_model.InfiniteEnums.HarvestEnum;
import com.ikanow.infinit.e.data_model.interfaces.harvest.EntityExtractorEnum;
import com.ikanow.infinit.e.data_model.interfaces.harvest.IEntityExtractor;
import com.ikanow.infinit.e.data_model.interfaces.harvest.ITextExtractor;
import com.ikanow.infinit.e.data_model.store.DbManager;
import com.ikanow.infinit.e.data_model.store.MongoDbManager;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.document.CompressedFullTextPojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.data_model.store.document.EntityPojo;
import com.ikanow.infinit.e.data_model.store.social.sharing.SharePojo;
import com.ikanow.infinit.e.data_model.store.social.sharing.SharePojo.ShareCommunityPojo;
import com.ikanow.infinit.e.data_model.utils.GeoOntologyMapping;
import com.ikanow.infinit.e.data_model.utils.IkanowSecurityManager;
import com.ikanow.infinit.e.data_model.utils.TrustManagerManipulator;
import com.ikanow.infinit.e.harvest.enrichment.custom.StructuredAnalysisHarvester;
import com.ikanow.infinit.e.harvest.enrichment.custom.UnstructuredAnalysisHarvester;
import com.ikanow.infinit.e.harvest.enrichment.legacy.TextRankExtractor;
import com.ikanow.infinit.e.harvest.enrichment.legacy.alchemyapi.ExtractorAlchemyAPI;
import com.ikanow.infinit.e.harvest.enrichment.legacy.alchemyapi.ExtractorAlchemyAPI_Metadata;
import com.ikanow.infinit.e.harvest.enrichment.legacy.opencalais.ExtractorOpenCalais;
import com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager;
import com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager_Integrated;
import com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager_Standalone;
import com.ikanow.infinit.e.harvest.extraction.document.HarvestStatus;
import com.ikanow.infinit.e.harvest.extraction.document.HarvestStatus_Integrated;
import com.ikanow.infinit.e.harvest.extraction.document.HarvestStatus_Standalone;
import com.ikanow.infinit.e.harvest.extraction.document.HarvesterInterface;
import com.ikanow.infinit.e.harvest.extraction.document.database.DatabaseHarvester;
import com.ikanow.infinit.e.harvest.extraction.document.distributed.DistributedHarvester;
import com.ikanow.infinit.e.harvest.extraction.document.file.FileHarvester;
import com.ikanow.infinit.e.harvest.extraction.document.logstash.LogstashHarvester;
import com.ikanow.infinit.e.harvest.extraction.document.rss.FeedHarvester;
import com.ikanow.infinit.e.harvest.extraction.text.boilerpipe.TextExtractorBoilerpipe;
import com.ikanow.infinit.e.harvest.extraction.text.externalscript.TextExtractorExternalScript;
import com.ikanow.infinit.e.harvest.extraction.text.legacy.TextExtractorTika;
import com.ikanow.infinit.e.harvest.utils.AuthUtils;
import com.ikanow.infinit.e.harvest.utils.HarvestExceptionUtils;
import com.ikanow.infinit.e.harvest.utils.PropertiesManager;
import com.mongodb.BasicDBObject;
import com.mongodb.gridfs.GridFSDBFile;

/**
 * @author cmorgan
 *
 * Used to process all incoming sources in the system
 * @param <DimensionPojo>
 */
public class HarvestController implements HarvestContext {

	private HarvestControllerPipeline procPipeline = null;
	private IkanowSecurityManager _securityManager = null;

	public IkanowSecurityManager getSecurityManager() {
		return _securityManager;
	}

	private PropertiesManager pm = new PropertiesManager();
	private IEntityExtractor default_entity_extractor = null;
	private ITextExtractor default_text_extractor = null;
	private ArrayList<HarvesterInterface> harvesters = new ArrayList<HarvesterInterface>();
	private static Set<String> urlsThatError = new TreeSet<String>();
	private static final Logger logger = Logger.getLogger(HarvestController.class);
	private HashMap<String, IEntityExtractor> entity_extractor_mappings = null;
	private HashMap<String, ITextExtractor> text_extractor_mappings = null;
	private HashSet<String> failedDynamicExtractors = null;
	private static HashMap<String, Class<?>> dynamicExtractorClassCache = null;
	private int _nMaxDocs = Integer.MAX_VALUE;
	private DuplicateManager _duplicateManager = new DuplicateManager_Integrated();
	private HarvestStatus _harvestStatus = new HarvestStatus_Integrated(); // (can either be standalone or integrated, defaults to integrated)

	public DuplicateManager getDuplicateManager() {
		return _duplicateManager;
	}

	public HarvestStatus getHarvestStatus() {
		return _harvestStatus;
	}

	boolean _bIsStandalone = false;

	public boolean isStandalone() {
		return _bIsStandalone;
	}

	public void setStandaloneMode(int nMaxDocs) {
		setStandaloneMode(nMaxDocs, false); // (by default don't dedup, however you may want to test updates)
	}

	public void setStandaloneMode(int nMaxDocs, boolean bRealDedup) {
		_bIsStandalone = true;
		urlsThatError.clear(); // (for api testing, obviously don't want to stop trying if we get an error)
		if (nMaxDocs >= 0) {
			_nMaxDocs = nMaxDocs;
		}
		if (!bRealDedup) {
			_duplicateManager = new DuplicateManager_Standalone();
		}
_harvestStatus = new HarvestStatus_Standalone(); if (null != dynamicExtractorClassCache) { // (standalone so don't cache extractors) dynamicExtractorClassCache.clear(); } } public int getStandaloneMaxDocs() { return _nMaxDocs; } private long nBetweenFeedDocs_ms = 10000; // (default 10s) //statistics variables private static AtomicInteger num_sources_harvested = new AtomicInteger(0); private static AtomicInteger num_docs_extracted = new AtomicInteger(0); private static AtomicInteger num_errors_source = new AtomicInteger(0); private static AtomicInteger num_error_url = new AtomicInteger(0); private static AtomicInteger num_error_px = new AtomicInteger(0); private static AtomicInteger num_ent_extracted = new AtomicInteger(0); private static AtomicInteger num_event_extracted = new AtomicInteger(0); private int nUrlErrorsThisSource = 0; /** * Used to find out the sources harvest of information is successful * @return */ public boolean isSuccessful() { return true; } // Handle clean shutdown of harvester private static boolean bIsKilled = false; public static void killHarvester() { bIsKilled = true; } public static boolean isHarvestKilled() { return bIsKilled; } ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // TOP LEVEL LOGICAL // Utility objects for loading custom text and entity extractors across all threads just once @SuppressWarnings("rawtypes") private static HashMap<String, Class> customExtractors = null; private static ClassLoader customExtractorClassLoader = HarvestController.class.getClassLoader(); /** * Constructor for Harvest Controller class * * @throws IOException */ public HarvestController() throws IOException { this(false); } private static boolean _initializedSSL = false; @SuppressWarnings("rawtypes") public HarvestController(boolean overrideTypeSettings) throws IOException { if (!_initializedSSL) { _initializedSSL = true; try { // Ensure we don't have any self-signed cert debacles: TrustManagerManipulator.allowAllSSL(); } finally { } } PropertiesManager props = new PropertiesManager(); String sTypes = props.getHarvesterTypes(); if (overrideTypeSettings) { // (override API settings in test mode) sTypes = "Feed,File,Database,Logstash,Distributed,Post_processing"; //(the post_processor isn't needed for harvester testing - but is needed for actual harvesting,... 
//...so they're included here for consistency - custom type scheduling is set up at publish time, so it isn't needed) //(similar comments apply for logstash) } String sType[] = sTypes.split("\\s*,\\s*"); // Add a harvester for each data type for (String s : sType) { if (s.equalsIgnoreCase("distributed")) { // (custom + distributed + post processing) try { this.harvesters.add(new DistributedHarvester()); } catch (Exception e) { logger.error(s + " not supported: " + e.getMessage()); } catch (NoClassDefFoundError e) { logger.error(s + " not supported: " + e.getMessage()); } } else if (s.equalsIgnoreCase("database")) { try { this.harvesters.add(new DatabaseHarvester()); } catch (Exception e) { logger.error(s + " not supported: " + e.getMessage()); } catch (NoClassDefFoundError e) { logger.error(s + " not supported: " + e.getMessage()); } } else if (s.equalsIgnoreCase("logstash")) { try { this.harvesters.add(new LogstashHarvester()); } catch (Exception e) { logger.error(s + " not supported: " + e.getMessage()); } catch (NoClassDefFoundError e) { logger.error(s + " not supported: " + e.getMessage()); } } else if (s.equalsIgnoreCase("file")) { // According to http://www.ryanchapin.com/fv-b-4-648/java-lang-OutOfMemoryError--unable-to-create-new-native-thread-Exception-When-Using-SmbFileInputStream.html // this is needed to avoid java.lang.OutOfMemoryError (intermittent - for me at least, it's happened for exactly 1 source, but consistently when it does) System.setProperty("jcifs.resolveOrder", "DNS"); System.setProperty("jcifs.smb.client.dfs.disabled", "true"); try { this.harvesters.add(new FileHarvester()); } catch (Exception e) { logger.error(s + " not supported: " + e.getMessage()); } catch (NoClassDefFoundError e) { logger.error(s + " not supported: " + e.getMessage()); } } else if (s.equalsIgnoreCase("feed")) { try { this.harvesters.add(new FeedHarvester()); } catch (Exception e) { logger.error(s + " not supported: " + e.getMessage()); } catch (NoClassDefFoundError e) { logger.error(s + " not supported: " + e.getMessage()); } } } // Load all the extractors, set up defaults entity_extractor_mappings = new HashMap<String, IEntityExtractor>(); text_extractor_mappings = new HashMap<String, ITextExtractor>(); // Load custom text/entity extractors synchronized (HarvestController.class) { if (null == customExtractors) { customExtractors = new HashMap<String, Class>(); customExtractorClassLoader = HarvestController.class.getClassLoader(); } // Text extractors: String customTextList = props.getCustomTextExtractors(); if (null != customTextList) { String customTextArray[] = customTextList.split("\\s*,\\s*"); for (String customText : customTextArray) { if (!customExtractors.containsKey(customText)) { // (else already have this extractor) try { Class customTextExtractor = customExtractorClassLoader.loadClass(customText); ITextExtractor obj = (ITextExtractor) customTextExtractor.newInstance(); text_extractor_mappings.put(obj.getName().toLowerCase(), obj); customExtractors.put(customText, customTextExtractor); } catch (Exception e) { logger.error("ITextExtractor: Couldn't load " + customText + ": " + e.getMessage(), e); } catch (NoClassDefFoundError e) { logger.error("ITextExtractor: Couldn't load " + customText + ": " + e.getMessage(), e); } } else { // Already loaded, put in again try { Class customTextExtractor = customExtractors.get(customText); ITextExtractor obj = (ITextExtractor) customTextExtractor.newInstance(); text_extractor_mappings.put(obj.getName().toLowerCase(), obj); } catch (Exception e) { 
logger.error("ITextExtractor: Couldn't use already loaded " + customText + ": " + e.getMessage(), e); } catch (NoClassDefFoundError e) { logger.error("ITextExtractor: Couldn't use already loaded " + customText + ": " + e.getMessage(), e); } } } } //TESTED // Entity extractors String customEntityList = props.getCustomEntityExtractors(); if (null != customEntityList) { String customEntityArray[] = customEntityList.split("\\s*,\\s*"); for (String customEntity : customEntityArray) { if (!customExtractors.containsKey(customEntity)) { // (else already have this extractor - but may have it for text, so some work to do) try { Class customEntityExtractor = customExtractorClassLoader.loadClass(customEntity); IEntityExtractor obj = (IEntityExtractor) customEntityExtractor.newInstance(); entity_extractor_mappings.put(obj.getName().toLowerCase(), obj); customExtractors.put(customEntity, customEntityExtractor); } catch (Exception e) { logger.error("IEntityExtractor: Couldn't load " + customEntity + ": " + e.getMessage(), e); } catch (NoClassDefFoundError e) { logger.error("IEntityExtractor: Couldn't load " + customEntity + ": " + e.getMessage(), e); } } else { // If this object exists and if it's a text extractor, then see if it's also an entity extractor try { Class customEntityExtractor = customExtractors.get(customEntity); IEntityExtractor obj = (IEntityExtractor) customEntityExtractor.newInstance(); entity_extractor_mappings.put(obj.getName(), obj); } catch (Exception e) { logger.error("IEntityExtractor: Couldn't use already loaded " + customEntity + ": " + e.getMessage(), e); } catch (NoClassDefFoundError e) { logger.error("IEntityExtractor: Couldn't use already loaded " + customEntity + ": " + e.getMessage(), e); } } } } //TESTED } try { entity_extractor_mappings.put("opencalais", new ExtractorOpenCalais()); } catch (Exception e) { logger.warn("Can't use OpenCalais as entity extractor: " + e.getMessage()); } try { entity_extractor_mappings.put("textrank", new TextRankExtractor()); } catch (Exception e) { logger.warn("Can't use textrank as entity extractor: " + e.getMessage()); } try { ExtractorAlchemyAPI both = new ExtractorAlchemyAPI(); entity_extractor_mappings.put("alchemyapi", both); text_extractor_mappings.put("alchemyapi", both); ExtractorAlchemyAPI_Metadata both_metadata = new ExtractorAlchemyAPI_Metadata(); entity_extractor_mappings.put("alchemyapi-metadata", both_metadata); text_extractor_mappings.put("alchemyapi-metadata", both_metadata); } catch (Exception e) { logger.warn("Can't use AlchemyAPI as entity/text extractor: " + e.getMessage()); } try { text_extractor_mappings.put("externalscript", new TextExtractorExternalScript()); } catch (Exception e) { logger.warn("Can't use ExternalScript as text extractor: " + e.getMessage()); } try { text_extractor_mappings.put("boilerpipe", new TextExtractorBoilerpipe()); } catch (Exception e) { logger.warn("Can't use Boilerpipe as text extractor: " + e.getMessage()); } try { text_extractor_mappings.put("tika", new TextExtractorTika()); } catch (Exception e) { logger.warn("Can't use Tika as text extractor: " + e.getMessage()); } if (null != pm.getDefaultEntityExtractor()) { default_entity_extractor = entity_extractor_mappings.get(pm.getDefaultEntityExtractor().toLowerCase()); } else { default_entity_extractor = null; } if (null != pm.getDefaultTextExtractor()) { default_text_extractor = text_extractor_mappings.get(pm.getDefaultTextExtractor().toLowerCase()); } else { try { default_text_extractor = new TextExtractorBoilerpipe(); } catch (Exception e) 
{ logger.warn("Can't use BoilerPlate as default text extractor: " + e.getMessage()); } } nBetweenFeedDocs_ms = props.getWebCrawlWaitTime(); // Set up security manager - basically always needed so might as well create here _securityManager = new IkanowSecurityManager(); } /** * Handles going through what to do with a source for harvesting * The process currently is: * 1. Extract from source * 2. Enrich with metadata from toAdd (entity, fulltext, events, etc) * * @param source The source to harvest */ public void harvestSource(SourcePojo source, List<DocumentPojo> toAdd, List<DocumentPojo> toUpdate, List<DocumentPojo> toRemove) { nUrlErrorsThisSource = 0; if (HarvestController.isHarvestKilled()) { // Already spent too long - just bail out from here source.setReachedMaxDocs(); return; } // New Harvest Pipeline logic if (null != source.getProcessingPipeline()) { if (null == procPipeline) { procPipeline = new HarvestControllerPipeline(); } procPipeline.extractSource_preProcessingPipeline(source, this); //(just copy the config into the legacy source fields since the // actual processing is the same in both cases) } //TESTED // Can override the default (feed) wait time from within the source (eg for sites that we know // don't get upset about getting hammered) if (null != source.getRssConfig()) { if (null != source.getRssConfig().getWaitTimeOverride_ms()) { nBetweenFeedDocs_ms = source.getRssConfig().getWaitTimeOverride_ms(); } } LinkedList<DocumentPojo> toDuplicate = new LinkedList<DocumentPojo>(); // Reset any state that might have been generated from the previous source getDuplicateManager().resetForNewSource(); getHarvestStatus().resetForNewSource(); // (temp location to store timings) source.setCreated(new Date()); //First up, Source Extraction (could spawn off some threads to do source extraction) // Updates will be treated as follows: // - extract etc etc (since they have changed) // [and then in generic processing // - remove them (including their child objects, eg events) ... // ... - but retain "created" date (and in the future artefacts like comments)] extractSource(source, toAdd, toUpdate, toRemove, toDuplicate); // (^^^ this adds toUpdate to toAdd) // (temp location to store timings) source.setModified(new Date()); if (null != source.getProcessingPipeline()) { procPipeline.setInterDocDelayTime(nBetweenFeedDocs_ms); try { procPipeline.enrichSource_processingPipeline(source, toAdd, toUpdate, toRemove); } finally { // (ensure can clear memory) procPipeline.clearState(); } } else { // Old logic (more complex, less functional) enrichSource(source, toAdd, toUpdate, toRemove); } completeEnrichmentProcess(source, toAdd, toUpdate, toRemove); // (Now we've completed enrichment either normally or by cloning, add the dups back to the normal documents for generic processing) LinkedList<DocumentPojo> groupedDups = new LinkedList<DocumentPojo>(); // (ie clones) DocumentPojo masterDoc = null; // (just looking for simple pointer matching here) for (DocumentPojo dupDoc : toDuplicate) { if (null == dupDoc.getCloneFrom()) { toAdd.add(dupDoc); } else if (null != dupDoc.getCloneFrom().getTempSource()) { //(Else doc was removed from toAdd list due to extraction errors) if (null == masterDoc) { // First time through masterDoc = dupDoc.getCloneFrom(); } else if (!masterDoc.getUrl().equals(dupDoc.getUrl())) { // New group! 
groupedDups = enrichDocByCloning(groupedDups); if (null != groupedDups) { toAdd.addAll(groupedDups); groupedDups.clear(); } else { groupedDups = new LinkedList<DocumentPojo>(); } masterDoc = dupDoc.getCloneFrom(); } groupedDups.add(dupDoc); } } //end loop over duplicates //TESTED, included case where the master doc errors during extraction (by good fortune!) if (null != groupedDups) { // (Leftover group) groupedDups = enrichDocByCloning(groupedDups); if (null != groupedDups) { toAdd.addAll(groupedDups); } } //TESTED (as above) } /** * Figures out what source extractors to use and then fills the toAdd list * with DocumentPojo objects from the extractors. * * @param flags The source extractors to use * @param start source to start extracting at * @param end source to stop extracting at * @param toAdd A reference to the toAdd that should be filled with what the source extracts */ @SuppressWarnings("unchecked") private void extractSource(SourcePojo source, List<DocumentPojo> toAdd, List<DocumentPojo> toUpdate, List<DocumentPojo> toRemove, List<DocumentPojo> toDup) { boolean normalCase = true; normalCase = (1 == source.getCommunityIds().size()) || // (normal case..) ((2 == source.getCommunityIds().size()) && source.getCommunityIds().contains(source.getOwnerId())); // (test case..) //determine which source extractor to use for (HarvesterInterface harvester : harvesters) { if (harvester.canHarvestType(InfiniteEnums.castExtractType(source.getExtractType()))) { try { List<DocumentPojo> tmpToAdd = new LinkedList<DocumentPojo>(); List<DocumentPojo> tmpToUpdate = new LinkedList<DocumentPojo>(); List<DocumentPojo> tmpToRemove = new LinkedList<DocumentPojo>(); harvester = harvester.getClass().newInstance(); // (create a new harvester for each source, avoids problems with state...) harvester.executeHarvest(this, source, tmpToAdd, tmpToUpdate, tmpToRemove); int nDocs = 0; for (List<DocumentPojo> docList : Arrays.asList(tmpToAdd, tmpToUpdate)) { for (DocumentPojo doc : docList) { if (++nDocs > _nMaxDocs) { break; } // Handle cloning on "duplicate docs" from different sources boolean bDuplicated = false; if (null != doc.getDuplicateFrom() && (null == doc.getUpdateId())) { DocumentPojo newDoc = enrichDocByDuplicating(doc); // (Note this is compatible with the cloning case whose logic is below: // this document gets fully populated here then added to dup list (with dupFrom==null), with a set of slaves // with dupFrom==sourceKey. 
When the dup list is traversed (after bypassing enrichment), the slaves are // then created from this master) if (null != newDoc) { doc = newDoc; bDuplicated = true; } } else { // if the update id is non-null then ignore the above logic doc.setDuplicateFrom(null); } // Copy over material from source pojo: doc.setSource(source.getTitle()); doc.setTempSource(source); doc.setMediaType(source.getMediaType()); if ((null == source.getAppendTagsToDocs()) || source.getAppendTagsToDocs()) { if (null != source.getTags()) { doc.setTags(new HashSet<String>(source.getTags())); } } ObjectId sCommunityId = source.getCommunityIds().iterator().next(); // (multiple communities handled below) String sIndex = new StringBuffer("doc_").append(sCommunityId.toString()).toString(); doc.setCommunityId(sCommunityId); doc.setIndex(sIndex); if (normalCase) { // Normal case (or test case) doc.setSourceKey(source.getKey()); } else { // Many communities for a single source, not a pleasant case String sMasterDocSourceKey = null; for (ObjectId id : source.getCommunityIds()) { if (null == sMasterDocSourceKey) { sMasterDocSourceKey = (source.getKey()); doc.setSourceKey(sMasterDocSourceKey); } else { // Will defer these until after the master doc has been added to the database DocumentPojo cloneDoc = new DocumentPojo(); // Will need these fields cloneDoc.setIndex(new StringBuffer("doc_").append(id).toString()); cloneDoc.setCommunityId(id); cloneDoc.setSourceKey(source.getKey()); cloneDoc.setSource(source.getTitle()); cloneDoc.setUrl(doc.getUrl()); if ((null == source.getAppendTagsToDocs()) || source.getAppendTagsToDocs()) { cloneDoc.setTags(new HashSet<String>(source.getTags())); } cloneDoc.setCloneFrom(doc); toDup.add(cloneDoc); } } //TESTED (both in clone and clone+duplicate) } // Normally add to enrichment list (for duplicates, bypass this) if (bDuplicated) { toDup.add(doc); // (Already enriched by duplication process) } else { toAdd.add(doc); } } } //(end loop over docs to add/update) num_docs_extracted.addAndGet(tmpToAdd.size() > _nMaxDocs ? 
_nMaxDocs : tmpToAdd.size()); toUpdate.addAll(tmpToUpdate); toRemove.addAll(tmpToRemove); } catch (Exception e) { //DEBUG //e.printStackTrace(); String reason = Globals.populateStackTrace(new StringBuffer(), e).toString(); logger.error("Error extracting source=" + source.getKey() + ", type=" + source.getExtractType() + ", reason=" + reason); _harvestStatus.update(source, new Date(), HarvestEnum.error, "Extraction error: " + reason, false, false); } break; //exit for loop, source is extracted } } } // // (LEGACY) Gets metadata using the extractors and appends to documents // private void enrichSource(SourcePojo source, List<DocumentPojo> toAdd, List<DocumentPojo> toUpdate, List<DocumentPojo> toRemove) { StructuredAnalysisHarvester sah = null; UnstructuredAnalysisHarvester usah = null; // Create metadata from the text using regex (also calculate header/footer information if desired) if (source.getUnstructuredAnalysisConfig() != null) { usah = new UnstructuredAnalysisHarvester(); // If performing structured analysis also then need to mux them // since the UAH will run on the body/description potentially created by the SAH // and the SAH will take the metadata generated by UAH to create entities and events if (source.getStructuredAnalysisConfig() != null) { sah = new StructuredAnalysisHarvester(); sah.addUnstructuredHandler(usah); } else { toAdd = usah.executeHarvest(this, source, toAdd); } } // For sources that generate structured data, we can turn that into entities and events // and fill in document fields from the metadata (that can be used by entity extraction) if (source.getStructuredAnalysisConfig() != null) { if (null == sah) { sah = new StructuredAnalysisHarvester(); } toAdd = sah.executeHarvest(this, source, toAdd); // (if usah exists then this runs usah) } // Perform text and entity extraction if (source.getStructuredAnalysisConfig() == null) // (Else is performed during SAH above) { if (isEntityExtractionRequired(source)) { // Text/Entity Extraction try { extractTextAndEntities(toAdd, source, false, false); } catch (Exception e) { handleExtractError(e, source); //handle extractor error if need be } } } // (end if no SAH) // Finish processing: // Complete batches if (isEntityExtractionRequired(source)) { try { extractTextAndEntities(null, source, true, false); } catch (Exception e) { } } } private void completeEnrichmentProcess(SourcePojo source, List<DocumentPojo> toAdd, List<DocumentPojo> toUpdate, List<DocumentPojo> toRemove) { // Map ontologies: completeDocumentBuilding(toAdd, toUpdate); int pxErrors = getHarvestStatus().getNumMessages(); num_error_px.addAndGet(pxErrors); // Log the number of feeds extracted for the current source if ((toAdd.size() > 0) || (toUpdate.size() > 0) || (toRemove.size() > 0) || (nUrlErrorsThisSource > 0) || (pxErrors > 0)) { StringBuffer sLog = new StringBuffer("source=") .append((null == source.getUrl() ? source.getKey() : source.getUrl())).append(" "); // (only need this for the log, not the source harvest message) if ((null != source.getHarvestStatus()) && (null != source.getHarvestStatus().getHarvest_message() && !source.getHarvestStatus().getHarvest_message().isEmpty())) { String message = source.getHarvestStatus().getHarvest_message().replace("\n", " "); if (message.length() > 512) { sLog.append("extracterr='").append(message.substring(0, 512)).append("...' 
"); } else { sLog.append("extracterr='").append(message).append("' "); } } //TESTED StringBuffer sLog2 = new StringBuffer(); long extractTime_ms = source.getModified().getTime() - source.getCreated().getTime(); long enrichTime_ms = new Date().getTime() - source.getModified().getTime(); // Extraction stats: sLog2.append("extracted=").append(toAdd.size()).append(" updated=").append(toUpdate.size()) .append(" deleted=").append(toRemove.size()).append(" extract_time_ms=").append(extractTime_ms) .append(" enrich_time_ms=").append(enrichTime_ms).append(" urlerrors=") .append(nUrlErrorsThisSource).append(" pxerrors=").append(pxErrors); getHarvestStatus().logMessage(sLog2.toString(), false); sLog.append(sLog2); // Other error info for the log only: String mostCommonMessage = getHarvestStatus().getMostCommonMessage(); if (null != mostCommonMessage) { if (mostCommonMessage.length() > 256) { mostCommonMessage = mostCommonMessage.substring(0, 253) + "...'"; } sLog.append(mostCommonMessage); // (don't need this in the harvest status since we already have all of them) } logger.info(sLog.toString()); } //TESTED // May need to update status again (eg any extractor errors or successes - in the harvesters or immediately above): if (getHarvestStatus().moreToLog()) { getHarvestStatus().update(source, new Date(), source.getHarvestStatus().getHarvest_status(), "", false, false); } // (note: the harvest status is updated 3 times: // 1) inside the source-type harvester (which: 1.1) resets the message 1.2) wipes the messages, but sets prevStatus.getHarvest_message() above) // 2) above (the update call, which occurs if logMessage() has been called at any point) // 3) after store/index manager, which normally just sets the status unless any errors occurred during indexing num_sources_harvested.incrementAndGet(); } // Quick utility to return if entity extraction has been specified by the user public boolean isEntityExtractionRequired(SourcePojo source) { return (((null == source.useExtractor()) && (null != default_entity_extractor)) || ((null != source.useExtractor()) && !source.useExtractor().equalsIgnoreCase("none"))) || (((null == source.useTextExtractor()) && (null != default_text_extractor)) || ((null != source.useTextExtractor()) && !source.useTextExtractor().equalsIgnoreCase("none"))); } /** * Takes a list of toAdd and extracts each ones full text and entities/events/sentiment (metadata) * * @param toAdd The list of toAdd without metadata to extract on * @return Any errors that occured while extracting, null if no error * @throws ExtractorSourceLevelTransientException */ public void extractTextAndEntities(List<DocumentPojo> toAdd, SourcePojo source, boolean bFinalizeBatchOnly, boolean calledFromPipeline) throws ExtractorDocumentLevelException, ExtractorSourceLevelException, ExtractorDailyLimitExceededException, ExtractorSourceLevelMajorException, ExtractorSourceLevelTransientException { IEntityExtractor currentEntityExtractor = null; try { int error_on_feed_count = 0, feed_count = 0; // EXTRACTOR SELECTION LOGIC if (null != source.useExtractor()) { currentEntityExtractor = entity_extractor_mappings.get(source.useExtractor().toLowerCase()); if (null == currentEntityExtractor) { // (second chance) currentEntityExtractor = (IEntityExtractor) lookForDynamicExtractor(source, false); } } if (currentEntityExtractor == null) // none specified or didn't find it (<-latter is error) { if ((null != source.useExtractor()) && !source.useExtractor().equalsIgnoreCase("none")) { // ie specified one but it doesn't exist.... 
StringBuffer errMsg = new StringBuffer("Skipping source=").append(source.getKey()) .append(" no_extractor=").append(source.useExtractor()); logger.warn(errMsg.toString()); // No point trying this for the rest of the day throw new ExtractorSourceLevelException(errMsg.toString()); } else if (null == source.useExtractor()) { // Didn't specify one, just use default: currentEntityExtractor = default_entity_extractor; } } //TESTED if (bFinalizeBatchOnly) { try { currentEntityExtractor.extractEntities(null); } catch (Exception e) { } // do nothing, eg handle entity extractors that don't handle things well return; } // A teeny bit of complex logic: // toAdd by default use a text extractor // DB/Files by default don't (but can override) ITextExtractor currentTextExtractor = null; boolean bUseRawContentWhereAvailable = false; // (only applies for feeds) if (null != source.useTextExtractor()) { currentTextExtractor = text_extractor_mappings.get(source.useTextExtractor().toLowerCase()); if (null == currentTextExtractor) { // (second chance) currentTextExtractor = (ITextExtractor) lookForDynamicExtractor(source, true); } } if (null == currentTextExtractor) { // none specified or didn't find it (<-latter is error) if (null != source.useTextExtractor()) { if ((null == source.getStructuredAnalysisConfig()) && (null == source.getUnstructuredAnalysisConfig()) && (null == source.getProcessingPipeline())) { //(UAH and SAH get raw access to the data if they need it, so can carry on - ditto processing pipeline) StringBuffer errMsg = new StringBuffer("Skipping source=").append(source.getKey()) .append(" no_txt_extractor=").append(source.useTextExtractor()); logger.warn(errMsg.toString()); // No point trying this for the rest of the day throw new ExtractorSourceLevelException(errMsg.toString()); } else { bUseRawContentWhereAvailable = true; // (only checked for feeds) } //TESTED } else if (source.getExtractType().equalsIgnoreCase("feed")) // (DB/files just use their existing fullText) { if (null != currentEntityExtractor) { String selfExtraction = currentEntityExtractor .getCapability(EntityExtractorEnum.URLTextExtraction); // Leave as null unless have no built-in capability if ((null == selfExtraction) || !selfExtraction.equals("true")) { currentTextExtractor = default_text_extractor; } } else { currentTextExtractor = default_text_extractor; } } //TESTED } // EXTRACTION Iterator<DocumentPojo> i = toAdd.iterator(); //iterator created so that elements in the toAdd list can be // removed within the loop while (i.hasNext()) { long nTime_ms = System.currentTimeMillis(); DocumentPojo doc = i.next(); boolean bExtractedText = false; // If I've been stopped then just remove all remaining documents // (pick them up next time through) if (bIsKilled) { i.remove(); if (!calledFromPipeline) { doc.setTempSource(null); // (can safely corrupt this doc since it's been removed) } continue; } if (calledFromPipeline || !urlsThatError.contains(doc.getUrl())) //only attempt if url is okay { feed_count++; try { // (Check for truncation) if ((null != currentEntityExtractor) && (null != doc.getFullText())) { try { String s = currentEntityExtractor.getCapability(EntityExtractorEnum.MaxInputBytes); if (null != s) { int maxLength = Integer.parseInt(s); if (doc.getFullText().length() > maxLength) { //just warn, it's up to the extractor to sort it out getHarvestStatus().logMessage( "Warning: truncating document to max length: " + s, false); } } } catch (Exception e) { } // max length not reported just carry on } if (null != 
currentTextExtractor) { bExtractedText = true; currentTextExtractor.extractText(doc); if (null != currentEntityExtractor) { currentEntityExtractor.extractEntities(doc); } } //TESTED else //db/filesys should already have full text extracted (unless otherwise specified) { if (source.getExtractType().equalsIgnoreCase("feed")) { // Need full text so get from current if ((null == doc.getFullText()) || !bUseRawContentWhereAvailable) { bExtractedText = true; if (null != currentEntityExtractor) { currentEntityExtractor.extractEntitiesAndText(doc); } } //TESTED (AlchemyAPI case) else { // Feed for which we've already extracted data if (null != currentEntityExtractor) { currentEntityExtractor.extractEntities(doc); } } //TESTED } else { // DB/File => use full text if (null != currentEntityExtractor) { currentEntityExtractor.extractEntities(doc); } } //TESTED } //statistics counting if (doc.getEntities() != null) num_ent_extracted.addAndGet(doc.getEntities().size()); if (doc.getAssociations() != null) num_event_extracted.addAndGet(doc.getAssociations().size()); } catch (ExtractorDailyLimitExceededException e) { //extractor can't do anything else today, return i.remove(); if (!calledFromPipeline) { doc.setTempSource(null); // (can safely corrupt this doc since it's been removed) } // Source error, ignore all other documents while (i.hasNext()) { doc = i.next(); if (!calledFromPipeline) { doc.setTempSource(null); // (can safely corrupt this doc since it's been removed) } i.remove(); } //TESTED throw e; // (ie stop processing this source) } //TESTED catch (Exception e) { // Anything except daily limit exceeded, expect it to be ExtractorDocumentLevelException //TODO (INF-1922): put this in a separate function and call that from pipeline on failure... // (not sure what to do about error_on_feed_count though, need to maintain a separate one of those in pipeline?) // This can come from (sort-of/increasingly) "user" code so provide a bit more information StringBuffer errMessage = HarvestExceptionUtils.createExceptionMessage(e); _harvestStatus.logMessage(errMessage.toString(), true); num_error_url.incrementAndGet(); nUrlErrorsThisSource++; if (!calledFromPipeline) { urlsThatError.add(doc.getUrl()); } error_on_feed_count++; i.remove(); if (!calledFromPipeline) { doc.setTempSource(null); // (can safely corrupt this doc since it's been removed) } } //TESTED } // (note this is only ever called in legacy mode - it's handled in the HarvestControllerPipeline) if ((null != source.getExtractType()) && (source.getExtractType().equalsIgnoreCase("feed"))) { if (i.hasNext() && bExtractedText) { nTime_ms = nBetweenFeedDocs_ms - (System.currentTimeMillis() - nTime_ms); // (ie delay time - processing time) if (nTime_ms > 0) { try { Thread.sleep(nTime_ms); } catch (Exception e) { } ; // (wait 10s between web-site accesses for politeness) } } } //(TESTED) } // end loop over documents //check if all toAdd were erroring, or more than 20 (arbitrary number) //NOTE: this is duplicated in HarvestControllerPipeline for non-legacy cases if ((error_on_feed_count == feed_count) && (feed_count > 5)) { String errorMsg = new StringBuffer().append(feed_count).append(" docs, ") .append(error_on_feed_count).append(", errors").toString(); if (error_on_feed_count > 20) { throw new ExtractorSourceLevelMajorException(errorMsg); } else { throw new ExtractorSourceLevelException(errorMsg); } //TESTED } } catch (ExtractorDailyLimitExceededException e) { // Percolate upwards! throw e; } catch (ExtractorSourceLevelException e) { // Percolate upwards! 
throw e; } catch (ExtractorSourceLevelMajorException e) { // Percolate upwards! throw e; } catch (Exception e) { // Misc internal error StringBuffer errMsg = new StringBuffer("Skipping source=").append(source.getKey()).append(" error=") .append(e.getMessage()); logger.error(errMsg.toString(), e); throw new ExtractorSourceLevelTransientException(errMsg.toString()); } //TESTED }//TESTED ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // UTILITY FUNCTIONS /** * Decides what to do with a source when an error is returned from the * extractor process. * * @param error The error that was returned from extractor * @param source The source that the extractor was working on */ public void handleExtractError(Exception error, SourcePojo source) { if (null != error) { if (error instanceof ExtractorDocumentLevelException) { num_error_url.incrementAndGet(); nUrlErrorsThisSource++; } else if (error instanceof ExtractorSourceLevelException) { num_errors_source.incrementAndGet(); //We flag the source in mongo and temp disable _harvestStatus.update(source, new Date(), HarvestEnum.error, "Source Level extraction error: " + error.getMessage(), true, false); } //TESTED else if (error instanceof ExtractorSourceLevelMajorException) { num_errors_source.incrementAndGet(); //We flag the source in mongo and perma disable _harvestStatus.update(source, new Date(), HarvestEnum.error, "Major source level Extraction error: " + error.getMessage(), true, true); } //TESTED else if (error instanceof ExtractorSourceLevelTransientException) { num_errors_source.incrementAndGet(); //We flag the source in mongo _harvestStatus.update(source, new Date(), HarvestEnum.error, "Transient source level extraction error: " + error.getMessage(), false, false); } //TESTED else if (error instanceof ExtractorDailyLimitExceededException) { //We flag the source in mongo and temp disable _harvestStatus.update(source, new Date(), HarvestEnum.success, "Extractor daily limit error.", true, false); } //TESTED } }//TESTED (just that the instanceofs work) /** * Prints out some quick info about how the harvester performed */ public static void logHarvesterStats() { StringBuilder sb = new StringBuilder(); sb.append("num_of_sources_harvested=" + num_sources_harvested.get()); sb.append(" num_of_docs_extracted=" + num_docs_extracted.get()); sb.append(" num_of_entities_extracted=" + num_ent_extracted.get()); sb.append(" num_of_events_extracted=" + num_event_extracted.get()); sb.append(" num_of_source_errors=" + num_errors_source.get()); sb.append(" num_of_url_errors=" + num_error_url.get()); sb.append(" num_of_px_errors=" + num_error_px.get()); logger.info(sb.toString()); } // Utility to handle the various multiple community problems: // - Different sources, name URL ("duplicates") ... get the doc from the DB (it's there by definition) // - Same source, multiple communities ("clones") ... 
get the doc from the first community processed private static DocumentPojo enrichDocByDuplicating(DocumentPojo docToReplace) { DocumentPojo newDoc = null; BasicDBObject dbo = getDocumentMetadataFromWhichToDuplicate(docToReplace); if (null != dbo) { String sContent = getDocumentContentFromWhichToDuplicate(docToReplace); if (null != sContent) { newDoc = duplicateDocument(docToReplace, dbo, sContent, false); // (Note this erases the "duplicateFrom" field - this is important because it distinguishes "clones" and "duplicates") } } return newDoc; }//TESTED private static LinkedList<DocumentPojo> enrichDocByCloning(List<DocumentPojo> docsToReplace) { DocumentPojo newDoc = null; BasicDBObject dbo = null; String sContent = null; LinkedList<DocumentPojo> newDocs = new LinkedList<DocumentPojo>(); for (DocumentPojo docToReplace : docsToReplace) { if (null == dbo) { // First time through... sContent = docToReplace.getCloneFrom().getFullText(); docToReplace.getCloneFrom().setFullText(null); dbo = (BasicDBObject) docToReplace.getCloneFrom().toDb(); docToReplace.getCloneFrom().setFullText(sContent); } newDoc = duplicateDocument(docToReplace, dbo, sContent, true); newDocs.add(newDoc); } return newDocs; }//TESTED // Sub-utility private static BasicDBObject getDocumentMetadataFromWhichToDuplicate(DocumentPojo docToReplace) { BasicDBObject query = new BasicDBObject("url", docToReplace.getUrl()); query.put("sourceKey", docToReplace.getDuplicateFrom()); BasicDBObject dbo = (BasicDBObject) DbManager.getDocument().getMetadata().findOne(query); return dbo; }//TESTED private static String getDocumentContentFromWhichToDuplicate(DocumentPojo docToReplace) { try { // Get the full text: byte[] storageArray = new byte[200000]; BasicDBObject contentQ = new BasicDBObject("url", docToReplace.getUrl()); contentQ.put(CompressedFullTextPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, docToReplace.getSourceKey()))); BasicDBObject fields = new BasicDBObject(CompressedFullTextPojo.gzip_content_, 1); BasicDBObject dboContent = (BasicDBObject) DbManager.getDocument().getContent().findOne(contentQ, fields); if (null != dboContent) { byte[] compressedData = ((byte[]) dboContent.get(CompressedFullTextPojo.gzip_content_)); ByteArrayInputStream in = new ByteArrayInputStream(compressedData); GZIPInputStream gzip = new GZIPInputStream(in); int nRead = 0; StringBuffer output = new StringBuffer(); while (nRead >= 0) { nRead = gzip.read(storageArray, 0, 200000); if (nRead > 0) { String s = new String(storageArray, 0, nRead, "UTF-8"); output.append(s); } } return output.toString(); } else { // Will just need to-reprocess this document return null; } } catch (Exception e) { // Do nothing, just carry on e.printStackTrace(); } return null; }//TESTED private static DocumentPojo duplicateDocument(DocumentPojo docToReplace, BasicDBObject dbo, String content, boolean bClone) { DocumentPojo newDoc = DocumentPojo.fromDb(dbo, DocumentPojo.class); newDoc.setFullText(content); newDoc.setId(null); // (ie ensure it's unique) if (bClone) { // Cloned docs have special source key formats (and also need to update their community) ObjectId docCommunity = docToReplace.getCommunityId(); newDoc.setSourceKey(docToReplace.getSourceKey()); newDoc.setCommunityId(docCommunity); newDoc.setIndex(new StringBuffer("doc_").append(docCommunity).toString()); } else { // For cloned documents, published etc can be taken from the master document, ie newDoc is already accurate // Copy over timing details from new document (set by the harvesters) 
newDoc.setPublishedDate(docToReplace.getPublishedDate()); newDoc.setCreated(docToReplace.getCreated()); newDoc.setModified(docToReplace.getModified()); } return newDoc; }//TESTED // // Any documents that have got this far are going to get processed // // Processing: //Attempt to map entity types to set of ontology types //eventually the plan is to allow extractors to set the ontology_type of //entities to anything found in the opencyc ontology static public void completeDocumentBuilding(List<DocumentPojo> docs, List<DocumentPojo> updateDocs) { // Handle documents to be added // Currently, just set ontology type if (docs != null) { for (DocumentPojo doc : docs) { if (doc.getEntities() != null) { num_ent_extracted.addAndGet(doc.getEntities().size()); for (EntityPojo entity : doc.getEntities()) { if (entity.getGeotag() != null) { if (null == entity.getOntology_type()) { entity.setOntology_type(GeoOntologyMapping.mapEntityToOntology(entity.getType())); } } } } if (doc.getAssociations() != null) { num_event_extracted.addAndGet(doc.getAssociations().size()); } } } // Remove any docs from update list that didn't get updated if (updateDocs != null) { Iterator<DocumentPojo> it = updateDocs.iterator(); while (it.hasNext()) { DocumentPojo d = it.next(); if (null == d.getTempSource()) { //this doc got rejected, normally it means we'll remove from update list so the db version is left alone if (null == d.getExplain()) { // exception: if d.getExplain != null then _remove_ from update list, so db version is deleted it.remove(); } //TODO (INF-2825): TOTEST } } } } /////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////// // Dynamic extraction utilities private synchronized Object lookForDynamicExtractor(SourcePojo source, boolean bTextExtractor) { String extractorName = bTextExtractor ? source.useTextExtractor() : source.useExtractor(); if (null == extractorName) { return null; } Object outClassInstance = null; if (null != failedDynamicExtractors) { // (cache for failed shares) if (failedDynamicExtractors.contains(extractorName)) { return null; } } ClassLoader savedClassLoader = null; try { ObjectId extractorId = null; if (extractorName.startsWith("/")) { // allow /<id>/free text.. 
extractorName = extractorName.substring(1).replaceFirst("/.*", ""); } //TESTED try { extractorId = new ObjectId(extractorName); } catch (Exception e) { // not a dynamic share that's fine, just exit no harm done return null; } // If we're here then it was a share BasicDBObject query = new BasicDBObject("_id", extractorId); SharePojo extractorInfo = SharePojo.fromDb(MongoDbManager.getSocial().getShare().findOne(query), SharePojo.class); if ((null != extractorInfo) && (null != extractorInfo.getBinaryId())) { // Check share owned by an admin: if (!AuthUtils.isAdmin(extractorInfo.getOwner().get_id())) { throw new RuntimeException("Extractor share owner must be admin"); } //TESTED // Check >0 source communities are in the share communities int nMatches = 0; for (ShareCommunityPojo commObj : extractorInfo.getCommunities()) { if (source.getCommunityIds().contains(commObj.get_id())) { nMatches++; break; } } if (0 == nMatches) { throw new RuntimeException("Extractor not shared across source communities"); } //TESTED savedClassLoader = Thread.currentThread().getContextClassLoader(); //HashMap<String, Class<?> > dynamicExtractorClassCache = null; if (null == dynamicExtractorClassCache) { dynamicExtractorClassCache = new HashMap<String, Class<?>>(); } URL[] cachedJarFile = { new File(maintainJarFileCache(extractorInfo)).toURI().toURL() }; Class<?> classToLoad = dynamicExtractorClassCache.get(extractorInfo.getTitle()); if (null == classToLoad) { URLClassLoader child = new URLClassLoader(cachedJarFile, savedClassLoader); Thread.currentThread().setContextClassLoader(child); classToLoad = Class.forName(extractorInfo.getTitle(), true, child); dynamicExtractorClassCache.put(extractorInfo.getTitle(), classToLoad); } if (bTextExtractor) { ITextExtractor txtExtractor = (ITextExtractor) classToLoad.newInstance(); text_extractor_mappings.put(source.useTextExtractor(), txtExtractor); outClassInstance = txtExtractor; } else { IEntityExtractor entExtractor = (IEntityExtractor) classToLoad.newInstance(); entity_extractor_mappings.put(source.useExtractor(), entExtractor); outClassInstance = entExtractor; } } } catch (Exception e) { StringBuffer sb = Globals.populateStackTrace(new StringBuffer(), e); sb.append( " (check the share's title is the fully qualified classname, and its community permissions are correct)"); getHarvestStatus().logMessage("custom extractor error: " + sb.toString(), false); if (null == failedDynamicExtractors) { failedDynamicExtractors = new HashSet<String>(); failedDynamicExtractors.add(extractorName); } //e.printStackTrace(); } // General fail just carry on catch (Error err) { StringBuffer sb = Globals.populateStackTrace(new StringBuffer(), err); sb.append( " (check the share's title is the fully qualified classname, and its community permissions are correct)"); getHarvestStatus().logMessage("custom extractor error: " + sb.toString(), false); if (null == failedDynamicExtractors) { failedDynamicExtractors = new HashSet<String>(); failedDynamicExtractors.add(extractorName); } //err.printStackTrace(); } // General fail just carry on finally { if (null != savedClassLoader) { Thread.currentThread().setContextClassLoader(savedClassLoader); } } return outClassInstance; }//TOTEST /** * Finds the gridfile given by id and returns the bytes * * @param id the object id of the gridfile to lookup (stored in sharepojo) * @return bytes of file stored in gridfile */ // private static byte[] getGridFile(ObjectId id) // { // ByteArrayOutputStream out = new ByteArrayOutputStream(); // try // { // GridFSDBFile file 
= DbManager.getSocial().getShareBinary().find(id); // file.writeTo(out); // byte[] toReturn = out.toByteArray(); // out.close(); // return toReturn; // } // catch (Exception ex){} // return null; // } /** * Downloads jar file from web using URL call. Typically * the jar files we be kept in our /share store so we will * be calling our own api. * * @param jarURL * @return * @throws Exception */ public static String maintainJarFileCache(SharePojo share) throws Exception { String tempFileName = System.getProperty("java.io.tmpdir") + "/" + share.get_id() + ".cache.jar"; File tempFile = new File(tempFileName); // Compare dates (if it exists) to see if we need to update the cache) if (!tempFile.exists() || (tempFile.lastModified() < share.getModified().getTime())) { OutputStream out = new BufferedOutputStream(new FileOutputStream(tempFileName)); if (share.getBinaryId() != null) { GridFSDBFile file = DbManager.getSocial().getShareBinary().find(share.getBinaryId()); file.writeTo(out); } else { out.write(share.getBinaryData()); } out.flush(); out.close(); } //TESTED return tempFileName; } }
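
Notes on selected techniques used in the class above follow, each with a small self-contained sketch. All class, method and variable names in the sketches are illustrative stand-ins, not Infinit.e APIs. First, standalone mode: setStandaloneMode() swaps the integrated duplicate/status managers for standalone ones and caps the number of documents, which is how a test caller can exercise the controller without touching the live store. A minimal sketch of that strategy-swap pattern, assuming a trivial duplicate checker:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

interface DuplicateChecker {                          // stands in for DuplicateManager
    boolean isDuplicate(String url);
}

class DbBackedChecker implements DuplicateChecker {   // "integrated" flavour
    public boolean isDuplicate(String url) { return false; /* would query the document store */ }
}

class NoOpChecker implements DuplicateChecker {       // "standalone" flavour: never dedup
    public boolean isDuplicate(String url) { return false; }
}

public class StandaloneModeSketch {
    private DuplicateChecker checker = new DbBackedChecker(); // integrated by default
    private int maxDocs = Integer.MAX_VALUE;

    // Mirrors setStandaloneMode(nMaxDocs, bRealDedup): swap strategies and cap output
    public void setStandaloneMode(int nMaxDocs, boolean realDedup) {
        if (nMaxDocs >= 0) maxDocs = nMaxDocs;
        if (!realDedup) checker = new NoOpChecker();
    }

    public List<String> harvest(List<String> urls) {
        List<String> out = new ArrayList<String>();
        for (String url : urls) {
            if (out.size() >= maxDocs) break;          // standalone document cap
            if (!checker.isDuplicate(url)) out.add(url);
        }
        return out;
    }

    public static void main(String[] args) {
        StandaloneModeSketch hc = new StandaloneModeSketch();
        hc.setStandaloneMode(2, false);
        System.out.println(hc.harvest(Arrays.asList("a", "b", "c"))); // [a, b]
    }
}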
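
The constructor registers custom text and entity extractors by splitting a comma-separated property of class names, loading each class reflectively, instantiating it, and keying it by the extractor's own lower-cased name; anything that fails to load is logged and skipped. A minimal sketch of that registry loop, using a hypothetical TextExtractorSketch interface in place of ITextExtractor:

import java.util.HashMap;
import java.util.Map;

interface TextExtractorSketch {
    String getName();
}

public class ExtractorRegistrySketch {
    public static Map<String, TextExtractorSketch> load(String csvClassNames) {
        Map<String, TextExtractorSketch> registry = new HashMap<String, TextExtractorSketch>();
        if (csvClassNames == null) return registry;
        for (String className : csvClassNames.split("\\s*,\\s*")) {
            try {
                Class<?> clazz = ExtractorRegistrySketch.class.getClassLoader().loadClass(className);
                TextExtractorSketch obj = (TextExtractorSketch) clazz.getDeclaredConstructor().newInstance();
                registry.put(obj.getName().toLowerCase(), obj);   // keyed by the extractor's own name
            }
            catch (Exception e) {            // as in the original: log and keep going
                System.err.println("Couldn't load " + className + ": " + e.getMessage());
            }
            catch (NoClassDefFoundError e) { // e.g. a missing transitive dependency
                System.err.println("Couldn't load " + className + ": " + e.getMessage());
            }
        }
        return registry;
    }

    public static void main(String[] args) {
        // Would normally come from something like PropertiesManager.getCustomTextExtractors()
        System.out.println(load("com.example.MissingExtractor").size()); // 0, error logged
    }
}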
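
extractSource() walks the list of registered harvesters, picks the first one whose canHarvestType() accepts the source's extract type, and instantiates a fresh copy of it per source so state never leaks between sources. A reduced sketch of that dispatch pattern with stand-in harvester types:

import java.util.Arrays;
import java.util.List;

interface Harvester {
    boolean canHarvestType(String type);
    void executeHarvest(String source);
}

class FeedSketchHarvester implements Harvester {
    public boolean canHarvestType(String type) { return "feed".equalsIgnoreCase(type); }
    public void executeHarvest(String source) { System.out.println("feed: " + source); }
}

class FileSketchHarvester implements Harvester {
    public boolean canHarvestType(String type) { return "file".equalsIgnoreCase(type); }
    public void executeHarvest(String source) { System.out.println("file: " + source); }
}

public class HarvesterDispatchSketch {
    public static void main(String[] args) throws Exception {
        List<Harvester> registry = Arrays.asList(new FeedSketchHarvester(), new FileSketchHarvester());
        String sourceType = "feed";
        for (Harvester h : registry) {
            if (h.canHarvestType(sourceType)) {
                // Fresh instance per source, as in the original (harvester.getClass().newInstance())
                Harvester fresh = (Harvester) h.getClass().getDeclaredConstructor().newInstance();
                fresh.executeHarvest("http://example.com/rss");
                break; // only the first matching harvester runs
            }
        }
    }
}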
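
For feed sources the legacy path enforces a politeness delay between documents: the time already spent extracting a document is subtracted from the configured inter-document wait (nBetweenFeedDocs_ms, default 10s) and only the remainder, if any, is slept. A sketch of that calculation:

public class CrawlDelaySketch {
    private static final long BETWEEN_DOCS_MS = 10000; // default 10s, as in nBetweenFeedDocs_ms

    public static void politeWait(long processingStartMs) {
        long remaining = BETWEEN_DOCS_MS - (System.currentTimeMillis() - processingStartMs);
        if (remaining > 0) {
            try {
                Thread.sleep(remaining);                // delay time minus processing time
            }
            catch (InterruptedException e) {
                Thread.currentThread().interrupt();     // preserve interrupt status
            }
        }
    }

    public static void main(String[] args) {
        long start = System.currentTimeMillis();
        // ... fetch and extract one web document here ...
        politeWait(start); // pause before hitting the same site again
    }
}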
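
At the end of extractTextAndEntities() the error rate is checked: if every document in a batch of more than five failed, the source itself is flagged, and past 20 failures the error is escalated to the "major" variant (which handleExtractError() treats as a permanent disable rather than a temporary one). A simplified sketch with stand-in exception types:

public class ErrorEscalationSketch {
    static class SourceLevelException extends Exception {
        SourceLevelException(String msg) { super(msg); }
    }
    static class SourceLevelMajorException extends SourceLevelException {
        SourceLevelMajorException(String msg) { super(msg); }
    }

    public static void checkErrorRate(int errorCount, int docCount) throws SourceLevelException {
        if ((errorCount == docCount) && (docCount > 5)) {
            String msg = docCount + " docs, " + errorCount + " errors";
            if (errorCount > 20) {
                throw new SourceLevelMajorException(msg); // source gets permanently disabled
            }
            else {
                throw new SourceLevelException(msg);      // source gets temporarily disabled
            }
        }
    }

    public static void main(String[] args) {
        try {
            checkErrorRate(6, 6);
        }
        catch (SourceLevelException e) {
            System.out.println(e.getClass().getSimpleName() + ": " + e.getMessage());
        }
    }
}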
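
getDocumentContentFromWhichToDuplicate() rebuilds a document's full text from the stored gzip_content field by inflating it in 200KB chunks. A sketch of that decompression, assuming the compressed bytes are already in hand; it accumulates raw bytes and decodes UTF-8 once at the end, which avoids splitting multi-byte characters across chunk boundaries:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

public class GzipFullTextSketch {
    public static String decompress(byte[] compressedData) throws IOException {
        byte[] storageArray = new byte[200000];           // same chunk size as the original
        GZIPInputStream gzip = new GZIPInputStream(new ByteArrayInputStream(compressedData));
        ByteArrayOutputStream decompressed = new ByteArrayOutputStream();
        int nRead;
        while ((nRead = gzip.read(storageArray, 0, storageArray.length)) > 0) {
            decompressed.write(storageArray, 0, nRead);   // accumulate raw bytes
        }
        return new String(decompressed.toByteArray(), StandardCharsets.UTF_8);
    }

    public static void main(String[] args) throws IOException {
        // Round-trip test: compress a string, then recover it
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        GZIPOutputStream out = new GZIPOutputStream(bytes);
        out.write("full text".getBytes(StandardCharsets.UTF_8));
        out.close();
        System.out.println(decompress(bytes.toByteArray())); // full text
    }
}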
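
lookForDynamicExtractor() loads extractor classes at runtime from a jar attached to an admin-owned share: it builds a child URLClassLoader over the cached jar, temporarily installs it as the thread context classloader, resolves the class named by the share title, and restores the original classloader in a finally block. A minimal sketch of just that class-loading step, with placeholder jar path and class name:

import java.io.File;
import java.net.URL;
import java.net.URLClassLoader;

public class DynamicJarLoadSketch {
    public static Object loadFromJar(String jarPath, String className) throws Exception {
        ClassLoader saved = Thread.currentThread().getContextClassLoader();
        try {
            URL[] jarUrl = { new File(jarPath).toURI().toURL() };
            URLClassLoader child = new URLClassLoader(jarUrl, saved);
            Thread.currentThread().setContextClassLoader(child);
            Class<?> classToLoad = Class.forName(className, true, child);
            return classToLoad.getDeclaredConstructor().newInstance();
        }
        finally {
            Thread.currentThread().setContextClassLoader(saved); // always restore
        }
    }

    public static void main(String[] args) throws Exception {
        // e.g. loadFromJar("/tmp/<shareId>.cache.jar", "com.example.MyCustomExtractor");
    }
}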
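
Finally, maintainJarFileCache() keeps one cached copy of each extractor jar under java.io.tmpdir, named <share id>.cache.jar, and only rewrites it when the file is missing or older than the share's modified timestamp. A sketch of that caching rule, with a plain byte array standing in for the GridFS/binary payload:

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Date;

public class JarCacheSketch {
    public static String maintainCache(String shareId, Date shareModified, byte[] jarBytes) throws IOException {
        String tempFileName = System.getProperty("java.io.tmpdir") + "/" + shareId + ".cache.jar";
        File tempFile = new File(tempFileName);
        // Only rewrite when the cached copy is missing or stale
        if (!tempFile.exists() || (tempFile.lastModified() < shareModified.getTime())) {
            OutputStream out = new BufferedOutputStream(new FileOutputStream(tempFileName));
            try {
                out.write(jarBytes); // in the real code this is GridFSDBFile.writeTo(out)
                out.flush();
            }
            finally {
                out.close();
            }
        }
        return tempFileName;
    }
}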