Java tutorial
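The listing below is the complete FileHarvester class from the Infinit.e open-source harvester. It implements HarvesterInterface and walks local directories, Samba/jCIFS file shares, and internal "inf://" shares or custom-job outputs. XML, JSON and *sv files are split into one document per record by dedicated parsers, while every other file type is passed through Apache Tika to extract its text and metadata; the resulting DocumentPojo objects are routed into the add/update/remove lists supplied by the harvest framework.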
/*******************************************************************************
 * Copyright 2012, The Infinit.e Open Source Project.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License, version 3,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package com.ikanow.infinit.e.harvest.extraction.document.file;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.Method;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.UnknownHostException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.Date;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;

import javax.xml.stream.FactoryConfigurationError;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;

import jcifs.smb.NtlmPasswordAuthentication;
import jcifs.smb.SmbException;

import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.txt.TXTParser;
import org.bson.types.ObjectId;
import org.xml.sax.ContentHandler;
import org.apache.commons.codec.digest.DigestUtils;

import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
import com.google.gson.stream.JsonReader;
import com.ikanow.infinit.e.data_model.InfiniteEnums;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorSourceLevelMajorException;
import com.ikanow.infinit.e.data_model.InfiniteEnums.HarvestEnum;
import com.ikanow.infinit.e.data_model.store.config.source.SourceFileConfigPojo;
import com.ikanow.infinit.e.data_model.store.config.source.SourceFileConfigPojo.StreamingType;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePipelinePojo;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.data_model.store.document.GeoPojo;
import com.ikanow.infinit.e.harvest.HarvestContext;
import com.ikanow.infinit.e.harvest.extraction.document.HarvesterInterface;
import com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager;
import
com.ikanow.infinit.e.harvest.utils.AuthUtils; import com.ikanow.infinit.e.harvest.utils.HarvestExceptionUtils; import com.ikanow.infinit.e.harvest.utils.PropertiesManager; public class FileHarvester implements HarvesterInterface { @SuppressWarnings("unused") private static final byte[] SP = " ".getBytes(); private int maxDepth; private Set<Integer> sourceTypesCanHarvest = new HashSet<Integer>(); private int maxDocsPerCycle = Integer.MAX_VALUE; @SuppressWarnings("unused") private static final String TYPES[] = { "TYPE_COMM", "TYPE_FILESYSTEM", "TYPE_NAMED_PIPE", "TYPE_PRINTER", "TYPE_SERVER", "TYPE_SHARE", "TYPE_WORKGROUP" }; private int errors = 0; // List of Feeds private List<DocumentPojo> files = null; private List<DocumentPojo> docsToAdd = null; private List<DocumentPojo> docsToUpdate = null; private List<DocumentPojo> docsToRemove = null; private boolean _deleteExistingFilesBySourceKey = false; private HashSet<String> sourceUrlsGettingUpdated = null; // (tells us source URLs that are being deleted) private HarvestContext _context; // Some internal state private boolean _streaming = false; // (new mode, currently unused) private boolean _customJob = false; // (some logic is different) private boolean _unshardedCustomJob = false; // (mostly just a safety case for when the custom job incorrectly doesn't shard) private Date _customLastRecordWritten = null; private boolean _bNeedsExtraSyncLock = false; // (handles support for broken jcifs INF-1406, should see if that's been fixed yet..) // Formatting office docs: allows HTML/XML output and to push options from the parsers into the tika instance private Tika _tika = null; ContentHandler _tikaOutputFormat = null; StringWriter _tikaXmlFormatWriter; ParseContext _tikaOutputParseContext = null; // Can specify regexes to select which files to ignore private Pattern includeRegex = null; // files only private Pattern excludeRegex = null; // files and paths // Security: private boolean harvestSecureMode = false; // Try to avoid blowing up the memory: private long _memUsage = 0; private static AtomicLong _totalMemUsage = new AtomicLong(0L); private static ThreadLocal<Long> _lastMemUsage = new ThreadLocal<Long>(); /** * Get a specific doc to return the bytes for * @throws Exception */ public static byte[] getFile(String fileURL, SourcePojo source) throws Exception { InputStream in = null; try { InfiniteFile searchFile = searchFileShare(source, fileURL); if (searchFile == null) return null; else { //found the file, return the bytes in = searchFile.getInputStream(); if (null == in) return null; ByteArrayOutputStream buffer = new ByteArrayOutputStream(); int read; byte[] data = new byte[16384]; while ((read = in.read(data, 0, data.length)) != -1) { buffer.write(data, 0, read); } buffer.flush(); return buffer.toByteArray(); } } catch (Exception e) { throw e; } finally { if (null != in) { in.close(); } } } /** * Same as the traverse method but returns the InfiniteFile if it finds searchFile * returns null otherwise * * @param f * @param source * @param depth * @param searchFile * @return * @throws SmbException */ private static InfiniteFile searchFileShare(SourcePojo source, String searchFile) throws Exception { // Made this synchronized to work around what looks like deadlock issue in code // This is undesirable and should be fixed once the underlying bug has been fixed // (note in practice this is only an issue for multiple threads going to the same domain) InfiniteFile f; synchronized (FileHarvester.class) { try { if (null != 
source.getProcessingPipeline()) { // new style... SourcePipelinePojo firstElement = source.getProcessingPipeline().iterator().next(); source.setFileConfig(firstElement.file); source.setUrl(firstElement.file.getUrl()); } //TESTED if (source.getUrl().startsWith("inf://")) { // Infinit.e share/custom object NtlmPasswordAuthentication auth = new NtlmPasswordAuthentication( source.getCommunityIds().iterator().next().toString(), source.getOwnerId().toString(), null); f = InfiniteFile.create(source.getUrl(), auth); if (f.isDirectory()) { InfiniteFile subs[] = f.listFiles(); for (InfiniteFile sub : subs) { if (sub.isDirectory()) { // (can only nest once) InfiniteFile subs2[] = sub.listFiles(); for (InfiniteFile sub2 : subs2) { if (sub2.getUrlString().equals(searchFile)) { return sub2; } //TOTEST } } //(end loop ove sub-dirs) else if (sub.getUrlString().equals(searchFile)) { return sub; } //TOTEST } //(end loop over dirs) } //TOTEST } //TODO (INF-2122): TOTEST else if (source.getFileConfig() == null || source.getFileConfig().password == null || source.getFileConfig().username == null) { f = InfiniteFile.create(searchFile); } else { if (source.getFileConfig().domain == null) { source.getFileConfig().domain = ""; } NtlmPasswordAuthentication auth = new NtlmPasswordAuthentication(source.getFileConfig().domain, source.getFileConfig().username, source.getFileConfig().password); f = InfiniteFile.create(searchFile, auth); } } //TESTED catch (Exception e) { int nIndex = searchFile.lastIndexOf("/"); searchFile = searchFile.substring(0, nIndex); // (ie not including the /) f = searchFileShare(source, searchFile); if (f.isDirectory()) { throw new MalformedURLException(searchFile + " is directory."); } } //TESTED return f; } // (End INF-1406 sync bug, see above explanation) } //TESTED /** * Get the list of docs * @return * @throws Exception */ private List<DocumentPojo> getFiles(SourcePojo source) throws Exception { InfiniteFile file = null; _deleteExistingFilesBySourceKey = false; try { if (source.getUrl().startsWith("inf://")) { // Infinit.e share/custom object NtlmPasswordAuthentication auth = new NtlmPasswordAuthentication( Arrays.toString(source.getCommunityIds().toArray()), source.getOwnerId().toString(), null); file = InfiniteFile.create(source.getUrl(), auth); if (source.getUrl().startsWith("inf://custom/")) { _customJob = true; // A few cases: // 1] If first time, or source has completed: // Quick check of share/custom date vs last imported doc in this case: ObjectId customLastRecordId = null; // Here are the two cases (whether in success/error/success_iteration // 1) non-append mode ... any time the first_record.time > last_doc.time then re-run (delete all docs) // 2) append-mode ... 
any time the last_record.time > last_doc.time then re-run/keep going // (the status clause below just determines if you keep going or not) // the file.getTime() call will automatically give you the correct version of 1 vs 2 depending on its status) if ((null == source.getHarvestStatus()) || (HarvestEnum.success == source.getHarvestStatus().getHarvest_status())) { if (!_context.getDuplicateManager().needsUpdated_Url(new Date(file.getDate()), null, source)) { return files; } //TESTED else { _customLastRecordWritten = _context.getDuplicateManager().getLastModifiedDate(); customLastRecordId = _context.getDuplicateManager().getLastModifiedDocId(); _context.getDuplicateManager().resetForNewSource(); // (reset the saved state since I faked my harvest status) _deleteExistingFilesBySourceKey = true; } //TESTED } else { // 2] If in the middle of a multiple harvest cycle.... // Specifically for custom, need to handle m/r changing ... we'll fake the harvest status // to force it to check the last doc's modified time vs the current file time... HarvestEnum saved = source.getHarvestStatus().getHarvest_status(); source.getHarvestStatus().setHarvest_status(HarvestEnum.success); try { if (_context.getDuplicateManager().needsUpdated_Url(new Date(file.getDate()), null, source)) { _deleteExistingFilesBySourceKey = true; } _customLastRecordWritten = _context.getDuplicateManager().getLastModifiedDate(); customLastRecordId = _context.getDuplicateManager().getLastModifiedDocId(); _context.getDuplicateManager().resetForNewSource(); // (reset the saved state since I faked my harvest status) } finally { // (rewrite original) source.getHarvestStatus().setHarvest_status(saved); } } //TESTED if (_streaming) { // Never delete files... _deleteExistingFilesBySourceKey = false; } //TESTED if (null == customLastRecordId) { // no docs, so no need for this // (or -in the case of distributed sources- the new harvest has already begun) _deleteExistingFilesBySourceKey = false; } //TESTED // Custom append mode: never delete anything, only process new objects InternalInfiniteFile customHandle = (InternalInfiniteFile) file; if (customHandle.isAppendingNotReplacing()) { _deleteExistingFilesBySourceKey = false; } //TESTED // Finally, if we wanted to delete the files then go ahead now: if (_deleteExistingFilesBySourceKey) { // For now, support only "non-append" mode efficiently: // Always delete all the old docs, updated docs will work but inefficiently (will delete and re-create) DocumentPojo docRepresentingSrcKey = new DocumentPojo(); if (null != source.getDistributionFactor()) { // If split across multiple docs then need a more expensive delete (note: still indexed) docRepresentingSrcKey.setId(customLastRecordId); } docRepresentingSrcKey.setCommunityId(source.getCommunityIds().iterator().next()); docRepresentingSrcKey.setSourceKey(source.getKey()); this.docsToRemove.add(docRepresentingSrcKey); } //TESTED } else { // share - this is much simpler: if (!_context.getDuplicateManager().needsUpdated_Url(new Date(file.getDate()), null, source)) { return files; } //TESTED } } //TESTED else if (source.getFileConfig() == null || source.getFileConfig().password == null || source.getFileConfig().username == null) { // Local file: => must be admin to continue if (harvestSecureMode) { // secure mode, must be admin if (source.getUrl().startsWith("file:")) { if (!AuthUtils.isAdmin(source.getOwnerId())) { throw new ExtractorSourceLevelMajorException("Permission denied"); } } } //TODO (INF-2119): come up with something better than this...(this is at 
least consistent with SAH/UAH security, apart from allowing admin more rights) file = InfiniteFile.create(source.getUrl()); } else // Samba - note (INF-1406) has sync lock bug so needs synchronization { _bNeedsExtraSyncLock = true; if (source.getFileConfig().domain == null) { source.getFileConfig().domain = ""; } NtlmPasswordAuthentication auth = new NtlmPasswordAuthentication(source.getFileConfig().domain, source.getFileConfig().username, source.getFileConfig().password); file = InfiniteFile.create(source.getUrl(), auth); } if (_customJob) { // (no concept of depth in custom job, but use directories to avoid maxing out) maxDepth = Integer.MAX_VALUE; } traverse(file, source, maxDepth); } catch (Exception e) { // If an exception here this is catastrophic, throw it upwards: errors++; throw e; } return files; } /** * Constructor for processing doc information for a source * @param maxDepth */ public FileHarvester() { sourceTypesCanHarvest.add(InfiniteEnums.FILES); maxDepth = 5; PropertiesManager pm = new PropertiesManager(); maxDocsPerCycle = pm.getMaxDocsPerSource(); harvestSecureMode = pm.getHarvestSecurity(); } // Process the doc private void processFiles(SourcePojo source) throws Exception { // Can override system settings if less: if ((null != source.getThrottleDocs()) && (source.getThrottleDocs() < maxDocsPerCycle)) { maxDocsPerCycle = source.getThrottleDocs(); } sourceUrlsGettingUpdated = new HashSet<String>(); LinkedList<String> duplicateSources = new LinkedList<String>(); try { // Compile regexes if they are present if ((null != source.getFileConfig()) && (null != source.getFileConfig().pathInclude)) { includeRegex = Pattern.compile(source.getFileConfig().pathInclude, Pattern.CASE_INSENSITIVE); } if ((null != source.getFileConfig()) && (null != source.getFileConfig().pathExclude)) { excludeRegex = Pattern.compile(source.getFileConfig().pathExclude, Pattern.CASE_INSENSITIVE); } if ((null != source.getFileConfig()) && (null != source.getFileConfig().maxDepth)) { this.maxDepth = source.getFileConfig().maxDepth; } // Process the fileshare getFiles(source); } catch (Exception e) { // If an exception here this is catastrophic, throw it upwards: errors++; throw e; } try { //Dedup code, ironically enough partly duplicated in parse(), probably unnecessarily DuplicateManager qr = _context.getDuplicateManager(); for (DocumentPojo doc : files) { try { duplicateSources.clear(); if (null != doc.getSourceUrl()) { boolean add = true; // However still need to check for duplicates so can update entities correctly (+maintain _ids, etc) // We only do this if the source URL changes (unless URL is taken from the object in which case all bets are off) boolean sourceUrlUpdated = sourceUrlsGettingUpdated.contains(doc.getSourceUrl()); if (!doc.getHasDefaultUrl() || sourceUrlUpdated) { // src URL for a given URL // (only if the the sourceUrl is not new...) if (qr.isDuplicate_Url(doc.getUrl(), source, duplicateSources)) { doc.setUpdateId(qr.getLastDuplicateId()); // (set _id to doc we're going to replace) if (!sourceUrlUpdated && !_deleteExistingFilesBySourceKey) { // Here update instead so we delete the old doc and add the new one add = false; docsToUpdate.add(doc); } //TESTED else { // (else *still* don't add this to updates because we've added the source URL or source key to the delete list) // (hence approximate create with the updateId...) 
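// (the _id being replaced is a MongoDB ObjectId, which embeds its creation time, so getTime() below approximates when the original document was created)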
if (null != doc.getUpdateId()) { doc.setCreated(new Date(doc.getUpdateId().getTime())); } //TESTED } //TESTED } //(note we don't get about duplicate sources in this case - just too complex+rare a case) } //TESTED (src url changing, different src url, non-default URL) // For composite files we (almost always) delete everything that already exists (via docsToRemove) and then add new docs if (add) { docsToAdd.add(doc); } //TESTED } else if (qr.isDuplicate_Url(doc.getUrl(), source, duplicateSources)) { // Other files, if the file already exists then update it (essentially, delete/add) doc.setUpdateId(qr.getLastDuplicateId()); // (set _id to doc we're going to replace) docsToUpdate.add(doc); } else { // if duplicateSources is non-empty then this URL is a duplicate of one from a different source if (!duplicateSources.isEmpty()) { doc.setDuplicateFrom(duplicateSources.getFirst()); } docsToAdd.add(doc); } } catch (Exception e) { errors++; _context.getHarvestStatus() .logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true); } } } catch (Exception e) { // If an exception here this is catastrophic, throw it upwards: errors++; throw e; } } private void parse(InfiniteFile f, SourcePojo source) throws MalformedURLException, URISyntaxException { //NOTE: we only ever break out of here because of max docs in standalone mode // (because we don't know how to continue reading) DocumentPojo doc = null; //Determine File Extension String fileName = f.getName().toString(); int mid = fileName.lastIndexOf("."); String extension = fileName.substring(mid + 1, fileName.length()); //Checked to save processing time long fileTimestamp = (f.getDate() / 1000) * 1000; // (ensure truncated to seconds, since some operation somewhere hear does this...) Date modDate = new Date(fileTimestamp); //XML Data gets placed into MetaData boolean bIsXml = false; boolean bIsJson = false; boolean bIsLineOriented = false; if ((null != source.getFileConfig()) && (null != source.getFileConfig().type)) { extension = source.getFileConfig().type; } bIsXml = extension.equalsIgnoreCase("xml"); bIsJson = extension.equalsIgnoreCase("json"); bIsLineOriented = extension.endsWith("sv"); if (bIsXml || bIsJson || bIsLineOriented) { int debugMaxDocs = Integer.MAX_VALUE; // by default don't set this, it's only for debug mode if (_context.isStandalone()) { // debug mode debugMaxDocs = maxDocsPerCycle; } //fast check to see if the file has changed before processing (or if it never existed) if (needsUpdated_SourceUrl(modDate, f.getUrlString(), source)) { if (0 != modDate.getTime()) { // if it ==0 then sourceUrl doesn't exist at all, no need to delete // This file already exists - in normal/managed mode will re-create // In streaming mode, simple skip over if (_streaming) { return; } //TESTED DocumentPojo docRepresentingSrcUrl = new DocumentPojo(); docRepresentingSrcUrl.setSourceUrl(f.getUrlString()); docRepresentingSrcUrl.setSourceKey(source.getKey()); docRepresentingSrcUrl.setCommunityId(source.getCommunityIds().iterator().next()); sourceUrlsGettingUpdated.add(docRepresentingSrcUrl.getSourceUrl()); this.docsToRemove.add(docRepresentingSrcUrl); // (can add documents with just source URL, are treated differently in the core libraries) } SourceFileConfigPojo fileSystem = source.getFileConfig(); if ((null == fileSystem) && (bIsXml || bIsJson)) { fileSystem = new SourceFileConfigPojo(); } XmlToMetadataParser xmlParser = null; JsonToMetadataParser jsonParser = null; String urlType = extension; if (bIsXml) { xmlParser = new 
XmlToMetadataParser(fileSystem.XmlRootLevelValues, fileSystem.XmlIgnoreValues, fileSystem.XmlSourceName, fileSystem.XmlPrimaryKey, fileSystem.XmlAttributePrefix, fileSystem.XmlPreserveCase, debugMaxDocs); } //TESTED else if (bIsJson) { jsonParser = new JsonToMetadataParser(fileSystem.XmlSourceName, fileSystem.XmlRootLevelValues, fileSystem.XmlPrimaryKey, fileSystem.XmlIgnoreValues, debugMaxDocs); } //TESTED List<DocumentPojo> partials = null; try { if (bIsXml) { XMLStreamReader xmlStreamReader = null; XMLInputFactory factory = XMLInputFactory.newInstance(); factory.setProperty(XMLInputFactory.IS_COALESCING, true); factory.setProperty(XMLInputFactory.SUPPORT_DTD, false); try { xmlStreamReader = factory.createXMLStreamReader(f.getInputStream()); partials = xmlParser.parseDocument(xmlStreamReader); long memUsage = xmlParser.getMemUsage(); _memUsage += memUsage; _totalMemUsage.addAndGet(memUsage); } finally { if (null != xmlStreamReader) xmlStreamReader.close(); } } //TESTED else if (bIsJson) { JsonReader jsonReader = null; try { jsonReader = new JsonReader(new InputStreamReader(f.getInputStream(), "UTF-8")); jsonReader.setLenient(true); partials = jsonParser.parseDocument(jsonReader); long memUsage = jsonParser.getMemUsage(); _memUsage += memUsage; _totalMemUsage.addAndGet(memUsage); } finally { if (null != jsonReader) jsonReader.close(); } } //TESTED else if (bIsLineOriented) { // Just generate a document for every line BufferedReader lineReader = null; try { lineReader = new BufferedReader(new InputStreamReader(f.getInputStream(), "UTF-8")); CsvToMetadataParser lineParser = new CsvToMetadataParser(debugMaxDocs); partials = lineParser.parseDocument(lineReader, source); long memUsage = lineParser.getMemUsage(); _memUsage += memUsage; _totalMemUsage.addAndGet(memUsage); } finally { if (null != lineReader) lineReader.close(); } } //TESTED MessageDigest md5 = null; // (generates unique urls if the user doesn't below) try { md5 = MessageDigest.getInstance("MD5"); } catch (NoSuchAlgorithmException e) { // Do nothing, unlikely to happen... 
} int nIndex = 0; int numPartials = partials.size(); for (DocumentPojo doctoAdd : partials) { nIndex++; doctoAdd.setSource(source.getTitle()); doctoAdd.setSourceKey(source.getKey()); doctoAdd.setMediaType(source.getMediaType()); doctoAdd.setModified(new Date(fileTimestamp)); doctoAdd.setCreated(new Date()); if (null == doctoAdd.getUrl()) { // Can be set in the parser or here doctoAdd.setHasDefaultUrl(true); // (ie cannot occur in a different src URL) if (1 == numPartials) { String urlString = f.getUrlString(); if (urlString.endsWith(urlType)) { doctoAdd.setUrl(urlString); } else { doctoAdd.setUrl( new StringBuffer(urlString).append('.').append(urlType).toString()); } // (we always set sourceUrl as the true url of the file, so want to differentiate the URL with // some useful information) } else if (null == doctoAdd.getMetadata()) { // Line oriented case doctoAdd.setUrl(new StringBuffer(f.getUrlString()).append("/").append(nIndex) .append('.').append(urlType).toString()); } else { if (null == md5) { // Will never happen, MD5 always exists doctoAdd.setUrl(new StringBuffer(f.getUrlString()).append("/") .append(doctoAdd.getMetadata().hashCode()).append('.').append(urlType) .toString()); } else { // This is the standard call if the XML parser has not been configured to build the URL doctoAdd.setUrl(new StringBuffer(f.getUrlString()).append("/") .append(DigestUtils.md5Hex(doctoAdd.getMetadata().toString())) .append('.').append(urlType).toString()); } } //TESTED } doctoAdd.setTitle(f.getName().toString()); doctoAdd.setPublishedDate(new Date(fileTimestamp)); doctoAdd.setSourceUrl(f.getUrlString()); // Always add to files because I'm deleting the source URL files.add(doctoAdd); } //TESTED } catch (XMLStreamException e1) { errors++; _context.getHarvestStatus() .logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(), true); } catch (FactoryConfigurationError e1) { errors++; _context.getHarvestStatus().logMessage(e1.getMessage(), true); } catch (IOException e1) { errors++; _context.getHarvestStatus() .logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(), true); } catch (Exception e1) { errors++; _context.getHarvestStatus() .logMessage(HarvestExceptionUtils.createExceptionMessage(e1).toString(), true); } } //(end if needs updated) } else //Tika supports Excel,Word,Powerpoint,Visio, & Outlook Documents { // (This dedup tells me if it's an add/update vs ignore - qr.isDuplicate higher up tells me if I need to add or update) if (needsUpdated_Url(modDate, f.getUrlString(), source)) { Metadata metadata = null; InputStream in = null; try { doc = new DocumentPojo(); // Create a tika object (first time only) if (null == _tika) { this.initializeTika(_context, source); } // BUGGERY // NEED TO LIKELY SET LIMIT TO BE 30MB or 50MB and BYPASS ANYTHING OVER THAT BELOW IS THE CODE TO DO THAT // tika.setMaxStringLength(30*1024*1024); // Disable the string length limit _tika.setMaxStringLength(-1); //input = new FileInputStream(new File(resourceLocation)); // Create a metadata object to contain the metadata metadata = new Metadata(); // Parse the file and get the text of the file doc.setSource(source.getTitle()); doc.setSourceKey(source.getKey()); doc.setMediaType(source.getMediaType()); String fullText = ""; in = f.getInputStream(); try { if (null == _tikaOutputFormat) { // text only fullText = _tika.parseToString(in, metadata); } //TESTED else { // XML/HMTL _tika.getParser().parse(in, _tikaOutputFormat, metadata, _tikaOutputParseContext); fullText = _tikaXmlFormatWriter.toString(); 
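// (clear the shared writer's buffer so the next file parsed starts from an empty string)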
_tikaXmlFormatWriter.getBuffer().setLength(0); } //TESTED } finally { if (null != in) in.close(); } int descCap = 500; doc.setFullText(fullText); if (descCap > fullText.length()) { descCap = fullText.length(); } doc.setDescription(fullText.substring(0, descCap)); doc.setModified(new Date(fileTimestamp)); doc.setCreated(new Date()); doc.setUrl(f.getUrlString()); doc.setTitle(f.getName().toString()); doc.setPublishedDate(new Date(fileTimestamp)); long memUsage = (250L * (doc.getFullText().length() + doc.getDescription().length())) / 100L; // 25% overhead, 2x for string->byte _memUsage += memUsage; _totalMemUsage.addAndGet(memUsage); // If the metadata contains a more plausible date then use that try { String title = metadata.get(Metadata.TITLE); if (null != title) { doc.setTitle(title); } } catch (Exception e) { // Fine just carry on } try { Date date = metadata.getDate(Metadata.CREATION_DATE); // MS Word if (null != date) { doc.setPublishedDate(date); } else { date = metadata.getDate(Metadata.DATE); // Dublin if (null != date) { doc.setPublishedDate(date); } else { date = metadata.getDate(Metadata.ORIGINAL_DATE); if (null != date) { doc.setPublishedDate(date); } } } } catch (Exception e) { // Fine just carry on } //TESTED // If the metadata contains a geotag then apply that: try { String lat = metadata.get(Metadata.LATITUDE); String lon = metadata.get(Metadata.LONGITUDE); if ((null != lat) && (null != lon)) { GeoPojo gt = new GeoPojo(); gt.lat = Double.parseDouble(lat); gt.lon = Double.parseDouble(lon); doc.setDocGeo(gt); } } catch (Exception e) { // Fine just carry on } // Save the entire metadata: doc.addToMetadata("_FILE_METADATA_", metadata); for (ObjectId communityId : source.getCommunityIds()) { doc.setCommunityId(communityId); } files.add(doc); // Close the input stream in.close(); in = null; //TESTED } catch (SmbException e) { errors++; _context.getHarvestStatus() .logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true); } catch (MalformedURLException e) { errors++; _context.getHarvestStatus() .logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true); } catch (UnknownHostException e) { errors++; _context.getHarvestStatus() .logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true); } catch (IOException e) { errors++; _context.getHarvestStatus().logMessage(e.getMessage(), true); } catch (TikaException e) { errors++; _context.getHarvestStatus().logMessage(e.getMessage(), true); } catch (Exception e) { errors++; _context.getHarvestStatus() .logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true); } finally { // Close the input stream if an error occurs if (null != in) { try { in.close(); } catch (IOException e) { // All good, do nothing } } } // end exception handling } // end dedup check } // end XML vs "office" app //DEBUG //System.out.println("FILE=" + files.size() + " / MEM=" + _memUsage + " VS " + Runtime.getRuntime().totalMemory()); } private void traverse(InfiniteFile f, SourcePojo source, int depth) throws Exception { if (depth == 0) { return; } try { InfiniteFile[] l; if (_customJob) { l = f.listFiles(_customLastRecordWritten, this.maxDocsPerCycle); } else { if (_bNeedsExtraSyncLock) { synchronized (FileHarvester.class) { l = f.listFiles(); } } else { // normal case, no synchronization needed l = f.listFiles(); } } for (int i = 0; l != null && i < l.length; i++) { if (null == l[i]) break; // (reached the end of the list) // Check what the deal with memory usage is: // (Allow 25% of current heap) 
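// (ie _totalMemUsage * 4 > maxMemory is the same test as _totalMemUsage > maxMemory / 4, so the harvester stops accepting new files once the estimated usage across all threads passes a quarter of the maximum heap)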
if ((_totalMemUsage.get() * 4) > Runtime.getRuntime().maxMemory()) { source.setReachedMaxDocs(); break; } //TESTED // Check to see if the item is a directory or a file that needs to parsed // if it is a file then parse the sucker using tika // if it is a directory then use recursion to dive into the directory if (files.size() >= this.maxDocsPerCycle) { source.setReachedMaxDocs(); break; } if (l[i].isDirectory()) { // Directories: included unless explicity exclude: String path = l[i].getUrlPath(); boolean bProcess = true; if (_customJob && !_unshardedCustomJob && (depth == Integer.MAX_VALUE)) { // custom jobs split by directory aka shard, and only at the top... if ((null != source.getDistributionTokens()) && (null != source.getDistributionFactor())) { int split = Math.abs(path.hashCode()) % source.getDistributionFactor(); if (!source.getDistributionTokens().contains(split)) { bProcess = false; } } //TESTED (custom) } if (bProcess && (null != excludeRegex)) { if (excludeRegex.matcher(path).matches()) { bProcess = false; } } //TESTED if (bProcess) { traverse(l[i], source, depth - 1); if (source.reachedMaxDocs()) { // (express elevator back to recursion root) return; } } } else { if (_customJob && (depth == Integer.MAX_VALUE)) { // file at level 1 => custom job is unsharded _unshardedCustomJob = true; } boolean bProcess = true; // Files: check both include and exclude and distribution logic String path = l[i].getUrlPath(); // Intra-source distribution logic: if (!_customJob || _unshardedCustomJob) { // custom jobs split by directory aka shard if ((null != source.getDistributionTokens()) && (null != source.getDistributionFactor())) { int split = Math.abs(path.hashCode()) % source.getDistributionFactor(); if (!source.getDistributionTokens().contains(split)) { bProcess = false; } } //TESTED (custom and non-custom) } if (bProcess && (null != includeRegex)) { if (!includeRegex.matcher(path).matches()) { bProcess = false; } } if (bProcess && (null != excludeRegex)) { if (excludeRegex.matcher(path).matches()) { bProcess = false; } } //TESTED if (bProcess) { parse(l[i], source); // (Adds to this.files) // If we've got here, check what we should do with the file if (!_context.isStandalone()) { if ((null != source.getFileConfig()) && (null != source.getFileConfig().renameAfterParse)) { try { if (source.getFileConfig().renameAfterParse.isEmpty() || source.getFileConfig().renameAfterParse.equals(".")) { // delete it l[i].delete(); } //TESTED else { l[i].rename(createNewName(l[i], source.getFileConfig().renameAfterParse)); } //TESTED } catch (IOException e) { // doesn't seem worth bombing out but should error _context.getHarvestStatus().logMessage( HarvestExceptionUtils.createExceptionMessage(e).toString(), true); } } //TESTED } } //(not excluded) } //(file not directory) l[i] = null; // (ie should now be able to free the memory } //(end loop over directory files) } catch (Exception e) { if (maxDepth == depth) { // Top level error, abandon ship errors++; throw e; } else { // Already had some luck with this URL keep going errors++; _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage(e).toString(), true); } } } private boolean needsUpdated_SourceUrl(Date mod, String sourceUrl, SourcePojo source) { try { DuplicateManager qr = _context.getDuplicateManager(); return qr.needsUpdated_SourceUrl(mod, sourceUrl, source); } catch (Exception e) { // Do nothing } return false; } private boolean needsUpdated_Url(Date mod, String url, SourcePojo source) { try { DuplicateManager qr = 
                _context.getDuplicateManager();
            return qr.needsUpdated_Url(mod, url, source);
        }
        catch (Exception e) {
            // Do nothing
        }
        return false;
    }

    @Override
    public boolean canHarvestType(int sourceType) {
        return sourceTypesCanHarvest.contains(sourceType);
    }

    @Override
    public void executeHarvest(HarvestContext context, SourcePojo source, List<DocumentPojo> toAdd,
            List<DocumentPojo> toUpdate, List<DocumentPojo> toRemove)
    {
        _context = context;
        if (_context.isStandalone()) {
            maxDocsPerCycle = _context.getStandaloneMaxDocs();
        }
        try {
            Long lastMemUsage = _lastMemUsage.get();
            if (null != lastMemUsage) { // re-using this thread => this memory is now definitely all gone
                _totalMemUsage.addAndGet(-lastMemUsage);
            } //TESTED (by hand)

            // Defaults to a "normal" mode that tries to spot existing files that have been modified and re-creates their harvested docs
            // In streaming mode it will just skip over those files and carry on
            // (This should be particularly useful for custom mode: the same job can be re-run on the last day's data and the source will keep adding the new documents)
            if ((null != source.getFileConfig()) && (null != source.getFileConfig().mode)
                    && (StreamingType.streaming == source.getFileConfig().mode))
            {
                _streaming = true;
            }
            //logger.debug("Source: " + source.getUrl());

            // Create a new list for files
            this.files = new LinkedList<DocumentPojo>();
            this.docsToAdd = toAdd;
            this.docsToUpdate = toUpdate;
            this.docsToRemove = toRemove;
            processFiles(source);

            // Harvested "successfully", post the status in MongoDB
            String logMsg = (0 == errors) ? ("")
                    : (new StringBuffer().append(errors).append(" file error(s).").toString());
            _context.getHarvestStatus().update(source, new Date(), HarvestEnum.in_progress, logMsg, false, false);
        }
        catch (Exception e) {
            errors++;
            _context.getHarvestStatus().update(source, new Date(), HarvestEnum.error, e.getMessage(), true, false);
        }
        finally { // (ie these can be deleted once the harvest is complete)
            this.files = null;
            this.docsToAdd = null;
            this.docsToUpdate = null;
            this.docsToRemove = null;
            // Can't remove the memory yet, because it persists until the doc is written:
            _lastMemUsage.set(_memUsage);
        }
    }

    // Renaming utility
    private static String createNewName(InfiniteFile subFile, String replacement)
            throws MalformedURLException, UnsupportedEncodingException, URISyntaxException
    {
        String path = subFile.getUrlString(); // (currently the entire string)
        String name = subFile.getName();
        int startOfName = path.lastIndexOf(name);
        return replacement.replace("$name", name).replace("$path", path.substring(0, startOfName - 1));
    }

    /////////////////////////////////////////////////////////////////////////////////////

    // Get tika options:
    // Bonus option output:xhtml|text
    // Bonus option bypass:<media type>
    // Example option: "application/pdf:{setEnableAutoSpace:false}", ie the format is mediaType:JSON
    // where JSON is key/value pairs for the function name and the arg (only String, bool, int/long/double types are possible)

    private void initializeTika(HarvestContext context, SourcePojo source) {
        AutoDetectParser autoDetectParser = new AutoDetectParser();
        if (null != source.getFileConfig().XmlRootLevelValues) {
            for (String s : source.getFileConfig().XmlRootLevelValues) {
                int separator = s.indexOf(':');
                String jsonStr = s.substring(separator + 1);
                if (separator > 0) {
                    String mediaType = s.substring(0, separator);
                    if (mediaType.equalsIgnoreCase("output")) { //special case, just going to configure output
                        if (jsonStr.equalsIgnoreCase("xml") || jsonStr.equalsIgnoreCase("xhtml")) {
                            _tikaXmlFormatWriter = new StringWriter();
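// (the handlers configured below route Tika's SAX output through a TransformerHandler into this shared StringWriter, so parse() can emit XML/HTML instead of plain extracted text)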
_tikaOutputFormat = getTransformerHandler("xml", _tikaXmlFormatWriter); _tikaOutputParseContext = new ParseContext(); } if (jsonStr.equalsIgnoreCase("html")) { _tikaXmlFormatWriter = new StringWriter(); _tikaOutputFormat = getTransformerHandler("html", _tikaXmlFormatWriter); _tikaOutputParseContext = new ParseContext(); } continue; } //TESTED else if (mediaType.equalsIgnoreCase("bypass")) { Map<MediaType, Parser> parsers = autoDetectParser.getParsers(); parsers.put(MediaType.parse(jsonStr), new TXTParser()); autoDetectParser.setParsers(parsers); continue; } // Try to get media type parser: Parser p = autoDetectParser.getParsers().get(MediaType.parse(mediaType)); while (p instanceof CompositeParser) { p = ((CompositeParser) p).getParsers().get(MediaType.parse(mediaType)); } if (null == p) { context.getHarvestStatus().logMessage( "Failed to find application type " + mediaType + " in tika option: " + s, true); continue; } //TESTED // Get JSON objects and try to apply try { JsonElement jsonObj = new JsonParser().parse(jsonStr); for (Map.Entry<String, JsonElement> keyVal : jsonObj.getAsJsonObject().entrySet()) { if (keyVal.getValue().getAsJsonPrimitive().isBoolean()) { //boolean try { Method method = p.getClass().getMethod(keyVal.getKey(), Boolean.class); method.invoke(p, (Boolean) keyVal.getValue().getAsJsonPrimitive().getAsBoolean()); } catch (Exception e) { try { Method method = p.getClass().getMethod(keyVal.getKey(), Boolean.TYPE); method.invoke(p, keyVal.getValue().getAsJsonPrimitive().getAsBoolean()); } catch (Exception e2) { context.getHarvestStatus().logMessage( "Failed to invoke " + keyVal.getKey() + " in tika option: " + s, true); continue; } //TESTED } } //TESTED if (keyVal.getValue().getAsJsonPrimitive().isString()) { //string try { Method method = p.getClass().getMethod(keyVal.getKey(), String.class); method.invoke(p, keyVal.getValue().getAsJsonPrimitive().getAsString()); } catch (Exception e) { context.getHarvestStatus().logMessage( "Failed to invoke " + keyVal.getKey() + " in tika option: " + s, true); continue; } } //TESTED (cut and paste) if (keyVal.getValue().getAsJsonPrimitive().isNumber()) { // number: int/long/double // Loads of options: Integer.class, Integer.TYPE, Long.class, Long.TYPE, Double.long, Double.TYPE boolean invoked = false; if (!invoked) { // Int.class try { Method method = p.getClass().getMethod(keyVal.getKey(), Integer.class); method.invoke(p, (Integer) keyVal.getValue().getAsJsonPrimitive().getAsInt()); invoked = true; } catch (Exception e) { } } if (!invoked) { // Int.type try { Method method = p.getClass().getMethod(keyVal.getKey(), Integer.TYPE); method.invoke(p, keyVal.getValue().getAsJsonPrimitive().getAsInt()); invoked = true; } catch (Exception e) { } } if (!invoked) { // Long.class try { Method method = p.getClass().getMethod(keyVal.getKey(), Long.class); method.invoke(p, (Long) keyVal.getValue().getAsJsonPrimitive().getAsLong()); invoked = true; } catch (Exception e) { } } if (!invoked) { // Long.type try { Method method = p.getClass().getMethod(keyVal.getKey(), Long.TYPE); method.invoke(p, keyVal.getValue().getAsJsonPrimitive().getAsLong()); invoked = true; } catch (Exception e) { } } if (!invoked) { // Double.class try { Method method = p.getClass().getMethod(keyVal.getKey(), Double.class); method.invoke(p, (Double) keyVal.getValue().getAsJsonPrimitive().getAsDouble()); invoked = true; } catch (Exception e) { } } if (!invoked) { // Double.type try { Method method = p.getClass().getMethod(keyVal.getKey(), Double.TYPE); method.invoke(p, 
                                            keyVal.getValue().getAsJsonPrimitive().getAsDouble());
                                    invoked = true;
                                }
                                catch (Exception e) { }
                            }
                        } //TOTEST (all the different options)
                    } //(end loop over options)
                }
                catch (Exception e) {
                    context.getHarvestStatus().logMessage("Failed to parse JSON in tika option: " + s, true);
                } //TESTED
            }
            else {
                context.getHarvestStatus().logMessage("Failed to parse tika option: " + s, true);
            } //TESTED
        } //TESTED
    } //(end if has options)

    _tika = new Tika(TikaConfig.getDefaultConfig().getDetector(), autoDetectParser);
    }//TESTED (apart from unused number option configuration)

    // (See http://stackoverflow.com/questions/9051183/how-to-use-tikas-xwpfwordextractordecorator-class)
    private static TransformerHandler getTransformerHandler(String method, StringWriter sw) {
        try {
            SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
            TransformerHandler handler = factory.newTransformerHandler();
            handler.getTransformer().setOutputProperty(OutputKeys.METHOD, method);
            handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
            handler.setResult(new StreamResult(sw));
            return handler;
        }
        catch (Exception e) {
            return null;
        }
    }//TESTED
}
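A minimal usage sketch, not part of the original class: the harvester is normally driven by the Infinit.e harvest framework, so the HarvestContext and the configured SourcePojo below are assumed to be supplied by that framework rather than constructed here.

// SourcePojo source = ...;                      // a file source with a file://, smb:// or inf:// URL
// HarvestContext context = ...;                 // supplied by the harvest framework
// List<DocumentPojo> toAdd = new LinkedList<DocumentPojo>();
// List<DocumentPojo> toUpdate = new LinkedList<DocumentPojo>();
// List<DocumentPojo> toRemove = new LinkedList<DocumentPojo>();
//
// FileHarvester harvester = new FileHarvester();
// if (harvester.canHarvestType(InfiniteEnums.FILES)) {
//     harvester.executeHarvest(context, source, toAdd, toUpdate, toRemove);
//     // toAdd/toUpdate/toRemove now hold the documents for the framework to write back to the store
// }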