Java tutorial
/******************************************************************************* * Copyright 2012, The Infinit.e Open Source Project. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License, version 3, * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. ******************************************************************************/ package com.ikanow.infinit.e.harvest.extraction.document.file; import java.io.ByteArrayInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; import java.util.ArrayList; import java.util.Date; import java.util.Enumeration; import java.util.List; import net.sf.jazzlib.GridFSZipFile; import net.sf.jazzlib.ZipEntry; import org.bson.types.ObjectId; import com.ikanow.infinit.e.data_model.store.DbManager; import com.ikanow.infinit.e.data_model.store.MongoDbManager; import com.ikanow.infinit.e.data_model.store.custom.mapreduce.CustomMapReduceJobPojo; import com.ikanow.infinit.e.data_model.store.social.sharing.SharePojo; import com.ikanow.infinit.e.harvest.utils.AuthUtils; import com.ikanow.utility.GridFSRandomAccessFile; import com.mongodb.BasicDBList; import com.mongodb.BasicDBObject; import com.mongodb.DBCollection; import com.mongodb.DBCursor; import com.mongodb.DBObject; import com.mongodb.gridfs.GridFSDBFile; import jcifs.smb.NtlmPasswordAuthentication; public class InternalInfiniteFile extends InfiniteFile { public static final String INFINITE_PREFIX = "inf://"; public static final String INFINITE_SHARE_PREFIX = "inf://share/"; public static final int INFINITE_SHARE_PREFIX_LEN = 12; public static final String INFINITE_CUSTOM_PREFIX = "inf://custom/"; public static final int INFINITE_CUSTOM_PREFIX_LEN = 13; ////////////////////////////////////////////////////////////////// // INTERFACE C'TORS public InternalInfiniteFile(String url, NtlmPasswordAuthentication auth) throws MalformedURLException { try { ObjectId locationId = null; ObjectId ownerId = null; String communityIdsStr = null; if (url.startsWith(INFINITE_SHARE_PREFIX)) { _isShare = true; locationId = new ObjectId(url.substring(INFINITE_SHARE_PREFIX_LEN).replaceFirst("/.*$", "")); // remove trailing /s, can be used for information //TESTED (2.1, 2.2.1, 2.3) BasicDBObject query = new BasicDBObject(SharePojo._id_, locationId); _resultObj = (BasicDBObject) MongoDbManager.getSocial().getShare().findOne(query); if (null == _resultObj) { throw new MalformedURLException("Not found (or not authorized): " + url); } //TESTED (7.1) String mediaType = (String) _resultObj.get(SharePojo.mediaType_); if ((null != mediaType) && (mediaType.equalsIgnoreCase("application/x-zip-compressed") || mediaType.equalsIgnoreCase("application/zip"))) { _isDirectory = true; ObjectId fileId = _resultObj.getObjectId(SharePojo.binaryId_); GridFSRandomAccessFile file = new GridFSRandomAccessFile( MongoDbManager.getSocial().getShareBinary(), fileId); _zipView = new GridFSZipFile(_resultObj.getString(SharePojo.title_), file); } //TESTED (3.1) else { // Single share if (_resultObj.containsField(SharePojo.documentLocation_)) { throw new MalformedURLException("Reference shares are not currently supported"); } //TESTED (0.1) _isDirectory = false; // (this will get traversed as the initial "directory", which doesn't check isDirectory... //... and will return itself as a single file in the "directory") } //TESTED (1.1, 2.1, 3.1) } //TESTED else if (url.startsWith(INFINITE_CUSTOM_PREFIX)) { _isCustom = true; _isDirectory = true; BasicDBObject query = null; String locationStr = url.substring(INFINITE_CUSTOM_PREFIX_LEN).replaceFirst("/.*$", ""); StringBuffer sb = new StringBuffer(INFINITE_CUSTOM_PREFIX); try { locationId = new ObjectId(locationStr); query = new BasicDBObject(CustomMapReduceJobPojo._id_, locationId); } //TESTED (4.1) catch (Exception e) { // for custom jobs can also specify the job name query = new BasicDBObject(CustomMapReduceJobPojo.jobtitle_, locationStr); } //TESTED (5.1, 6.1) _resultObj = (BasicDBObject) MongoDbManager.getCustom().getLookup().findOne(query); if (null == _resultObj) { throw new MalformedURLException("Not found (or not authorized): " + url); } //TESTED (7.2, 7.3) if (null != locationId) { sb.append(locationStr).append('/') .append(_resultObj.getString(CustomMapReduceJobPojo.jobtitle_)).append('/'); } //TESTED (5.1, 6.1) else { sb.append(_resultObj.getObjectId(CustomMapReduceJobPojo._id_).toString()).append('/') .append(locationStr).append('/'); } //TESTED (4.1) _originalUrl = sb.toString(); _isCustomAppend = _resultObj.getBoolean(CustomMapReduceJobPojo.appendResults_, false); String outputDatabase = _resultObj.getString(CustomMapReduceJobPojo.outputDatabase_); String outputCollection = _resultObj.getString(CustomMapReduceJobPojo.outputCollection_); if (null == outputDatabase) { outputDatabase = "custommr"; } DBCollection outColl = MongoDbManager.getCollection(outputDatabase, outputCollection); BasicDBObject sort = null; if (_isCustomAppend) { // Use time of _last_ record as file time sort = new BasicDBObject("_id", -1); } else { // Use time of _first_ record as file time sort = new BasicDBObject("_id", 1); } //TESTED DBCursor dbc = outColl.find().sort(sort).limit(1); List<DBObject> firstOrLastRecordList = dbc.toArray(); if (!firstOrLastRecordList.isEmpty()) { _overwriteTime = ((ObjectId) firstOrLastRecordList.iterator().next().get("_id")).getTime(); } else { // No records, use lastRunTime_ as backup _overwriteTime = _resultObj.getDate(CustomMapReduceJobPojo.lastRunTime_, new Date()).getTime(); } //TOTEST } //TESTED else { throw new MalformedURLException("Not recognized: " + url); } //TESTED (7.4) communityIdsStr = auth.getDomain(); ownerId = new ObjectId(auth.getUsername()); // Now do some authentication: // Check communities first since that involves no external DB queries: boolean isAuthorized = false; if (_isShare) { BasicDBList communities = (BasicDBList) _resultObj.get(SharePojo.communities_); for (Object communityObj : communities) { BasicDBObject communityDbo = (BasicDBObject) communityObj; ObjectId commId = communityDbo.getObjectId("_id"); if (communityIdsStr.contains(commId.toString())) { isAuthorized = true; break; } } } //TESTED (7.*) else { //_isCustom BasicDBList communities = (BasicDBList) _resultObj.get(CustomMapReduceJobPojo.communityIds_); for (Object communityObj : communities) { ObjectId commId = (ObjectId) communityObj; if (communityIdsStr.equals(commId)) { isAuthorized = true; break; } } } //TESTED (7.*) if (!isAuthorized) { // Still OK ... only if user is an admin isAuthorized = AuthUtils.isAdmin(ownerId); } //TESTED (1,2,3,4,5,6) if (!isAuthorized) { // Permission fail throw new MalformedURLException("Not found (or not authorized): " + url); } //TESTED (7.5) } catch (Exception e) { throw new MalformedURLException("Invalid authentication or location: " + e.getMessage()); } //(just passed exceptions on) // Save original URL if (_isShare) { // (custom handled above) if (!url.endsWith("/")) { _originalUrl = url + "/"; } else { _originalUrl = url; } } //(TESTED 1.3, 2.3, 3.3) }//TESTED ////////////////////////////////////////////////////////////////// // INTERNAL C'TORS // Share/ZIP file protected InternalInfiniteFile(InternalInfiniteFile parent, String zipFilename) { _resultObj = parent._resultObj; _zipView = parent._zipView; _zipViewFilename = zipFilename; _zipEntry = _zipView.getEntry(_zipViewFilename); _isDirectory = false; _originalUrl = parent._originalUrl; _isShare = true; }//TESTED (3.2) // Custom/virtual directory protected InternalInfiniteFile(InternalInfiniteFile parent, ObjectId startId, ObjectId endId) { _resultObj = parent._resultObj; _virtualDirStartLimit = startId; _virtualDirEndLimit = endId; _isDirectory = true; _originalUrl = parent._originalUrl; _isCustom = true; if (null != endId) { _overwriteTime = endId.getTime(); } else if (null != startId) { _overwriteTime = startId.getTime(); } else _overwriteTime = parent._overwriteTime; }//TESTED (6.2.2) (custom _overwriteTime by hand) // Custom/file protected InternalInfiniteFile(InternalInfiniteFile parent, BasicDBObject document) { _resultObj = document; _isDirectory = false; _originalUrl = parent._originalUrl; _isCustom = true; Object id = _resultObj.get("_id"); if ((null != id) && (id instanceof ObjectId)) { _overwriteTime = ((ObjectId) id).getTime(); } else _overwriteTime = parent._overwriteTime; // (backup for odd/old custom jobs) }//TESTED (4.2) (custom _overwriteTime by hand) ////////////////////////////////////////////////////////////////// // INTERFACE METHODS // INTERNAL SPECIFIC // For custom jobs - if this is an incremental job then process slightly differently public boolean isAppendingNotReplacing() { return _isCustomAppend; } // OVERRIDING @Override public InputStream getInputStream() throws IOException { if (!_isDirectory) { if (_isShare && (null == _zipView)) { String jsonShare = (String) _resultObj.get(SharePojo.share_); if (null != jsonShare) { return new ByteArrayInputStream(jsonShare.toString().getBytes()); } //TESTED (1.4) else { // must be binary GridFSDBFile file = DbManager.getSocial().getShareBinary() .find(_resultObj.getObjectId(SharePojo.binaryId_)); return file.getInputStream(); } //TESTED (2.4) } else if (_isShare) { // then must be a zip file try { return _zipView.getInputStream(_zipEntry); } catch (IOException e) { throw new FileNotFoundException(e.getMessage()); } } //TESTED (3.2.1) else if (_isCustom) { return new ByteArrayInputStream(_resultObj.toString().getBytes()); } //TESTED (4.2.1) } return null; } @Override public InfiniteFile[] listFiles() { return listFiles(null, Integer.MAX_VALUE); } @Override public InfiniteFile[] listFiles(Date optionalFilterDate, int maxDocsPerCycle) { if (_isDirectory) { if (_isShare) { // must be a zip file ArrayList<InfiniteFile> zipFiles = new ArrayList<InfiniteFile>(); @SuppressWarnings("unchecked") Enumeration<net.sf.jazzlib.ZipEntry> entries = _zipView.entries(); while (entries.hasMoreElements()) { net.sf.jazzlib.ZipEntry zipInfo = entries.nextElement(); InternalInfiniteFile newFile = new InternalInfiniteFile(this, zipInfo.getName()); zipFiles.add(newFile); } return zipFiles.toArray(new InfiniteFile[zipFiles.size()]); } //TESTED (3.2) else if (_isCustom) { // create some virtual directories eg at most 10K per "virtual directory" String outputDatabase = _resultObj.getString(CustomMapReduceJobPojo.outputDatabase_); String outputCollection = _resultObj.getString(CustomMapReduceJobPojo.outputCollection_); if (null == outputDatabase) { outputDatabase = "custommr"; } DBCollection outColl = null; DBCursor dbc = null; if ((null == _virtualDirStartLimit) && (null == _virtualDirEndLimit)) { // Actual directory DBCollection chunks = MongoDbManager.getCollection("config", "chunks"); StringBuffer ns = new StringBuffer(outputDatabase).append(".").append(outputCollection); dbc = chunks.find(new BasicDBObject("ns", ns.toString())); int splits = dbc.count(); if (splits < 2) { // Nothing to do (unsharded or 1 chunk) dbc.close(); outColl = MongoDbManager.getCollection(outputDatabase, outputCollection); dbc = outColl.find(); } //TESTED (4.2) else { // Create one virtual dir per split InfiniteFile[] virtualDirs = new InfiniteFile[splits]; int added = 0; for (DBObject splitObj : dbc) { BasicDBObject minObj = (BasicDBObject) splitObj.get("min"); BasicDBObject maxObj = (BasicDBObject) splitObj.get("max"); ObjectId minId = null; try { minId = (ObjectId) minObj.get("_id"); } catch (Exception e) { } // min key.. ObjectId maxId = null; try { maxId = (ObjectId) maxObj.get("_id"); } catch (Exception e) { } // max key.. //Handle current case where custom jobs are all dumped in with the wrong _id type if ((null != minId) || (null != maxId)) { if ((null != maxId) && (null != optionalFilterDate)) { // (also used on the files below) if (maxId.getTime() < optionalFilterDate.getTime()) { // (the "getTime()"s can overlap across chunks so we have to use minId // and accept that we'll often deserialize 1+ extra chunk every harvest) continue; } } //TESTED (by hand) InternalInfiniteFile split = new InternalInfiniteFile(this, minId, maxId); virtualDirs[added] = split; added++; } //TESTED (5.2.2, 6.2.2) (chunk skipping by hand) } dbc.close(); return virtualDirs; } //TESTED (5.2.2, 6.2.2) } //TESTED else { // Virtual directory BasicDBObject query = new BasicDBObject(); if (null != _virtualDirStartLimit) { if (null != optionalFilterDate) { ObjectId altStartId = new ObjectId((int) (optionalFilterDate.getTime() / 1000L), 0, 0); //(zero out the inc/machine ids so this query is independent to calling service) if (altStartId.compareTo(_virtualDirStartLimit) > 0) { // (altStartId > _virtualDirStartLimit) query.put(MongoDbManager.gte_, altStartId); } else { query.put(MongoDbManager.gte_, _virtualDirStartLimit); } } //TESTED (by hand) else { // normal case query.put(MongoDbManager.gte_, _virtualDirStartLimit); } } else if (null != optionalFilterDate) { // (first chunk so always overwrite with optionalFilter date if applicable) ObjectId altStartId = new ObjectId((int) (optionalFilterDate.getTime() / 1000L), 0, 0); query.put(MongoDbManager.gte_, altStartId); } //TESTED (by hand) if (null != _virtualDirEndLimit) { query.put(MongoDbManager.lt_, _virtualDirEndLimit); } outColl = MongoDbManager.getCollection(outputDatabase, outputCollection); dbc = outColl.find(new BasicDBObject("_id", query)).limit(1 + maxDocsPerCycle); } //TESTED (6.2.2) (doc skipping by hand) if (null != outColl) { // has files, create the actual file objects //DEBUG //System.out.println("CHUNK: GOT " + dbc.count()); int docCount = dbc.count(); if (docCount > 1 + maxDocsPerCycle) { docCount = 1 + maxDocsPerCycle; // (we're limiting it here anyway) } InfiniteFile[] docs = new InfiniteFile[docCount]; int added = 0; for (DBObject docObj : dbc) { // (if didn't use a query then apply internal filter date by hand) if ((null == _virtualDirStartLimit) && (null == _virtualDirEndLimit) && (null != optionalFilterDate)) { ObjectId docId = (ObjectId) docObj.get("_id"); if (optionalFilterDate.getTime() > docId.getTime()) { continue; } } //TESTED if (added >= maxDocsPerCycle) { // (we've reached our limit so put the remaining docs in a new directory, will only be used if it has to) docs[added] = new InternalInfiniteFile(this, (ObjectId) docObj.get("_id"), _virtualDirEndLimit); break; } else { InternalInfiniteFile doc = new InternalInfiniteFile(this, (BasicDBObject) docObj); docs[added] = doc; } //TESTED (both cases) added++; } dbc.close(); return docs; } //TESTED (4.2) } } else { // can just return myself InfiniteFile[] retVal = new InfiniteFile[1]; retVal[0] = this; return retVal; } //TESTED (1.2, 2.2) return null; } //delete and rename will just call the InfiniteFile versions, which will exception out @Override public boolean isDirectory() throws IOException { return _isDirectory; } @Override public String getUrlString() throws MalformedURLException, URISyntaxException { return _originalUrl + getName(); }//TESTED (1.2, 1.3, 2.2, 2.3. 3.2.1, 3.3, 4.2.1, 4.3) @Override public String getUrlPath() throws MalformedURLException, URISyntaxException { if (null != this._virtualDirStartLimit) { return getUrlString().substring(5) + this._virtualDirStartLimit.toString(); } else { return getUrlString().substring(5); } }//TESTED (1.2, 1.3, 2.2, 2.3. 3.2.1, 3.3, 4.2.1, 4.3) @Override public URI getURI() throws MalformedURLException, URISyntaxException { return new URI("inf", "", getUrlString().substring(5), null, null); //(this odd construct is needed to handle spaces in paths) }//TESTED (1.2, 1.3, 2.2, 2.3. 3.2.1, 3.3, 4.2.1, 4.3) @Override public String getName() { if (null != _zipEntry) { return _zipViewFilename; } //TESTED (3.2.1) else if (_isShare) { // (this is both a dir and a file) return _resultObj.getString(SharePojo.title_); } //TESTED (1.2, 1.3, 2.2, 2.3, 3.3) else { // _isCustom if (_isDirectory) { // top level or virtual directory - returns no name return ""; } //TESTED (4.3) else { // just make it _id, it's the user's responsibility to assign a primary key if you need to keep this unique ObjectId updateId = _resultObj.getObjectId("_updateId", null); if (null == updateId) { return _resultObj.getObjectId("_id").toString(); } else { // I am a modified old object so retain my name for dedup... return updateId.toString(); } //TOTEST } //TESTED (4.2.1) } }//TESTED @Override public long getDate() { if (null != _overwriteTime) { return _overwriteTime; } if (_isShare) { return (_overwriteTime = _resultObj.getDate(SharePojo.modified_, new Date()).getTime()); } //Custom will always have _overwriteTime, so this is just to avoid compiler error return 0L; }//TESTED (1.2, 1.3, 2.2, 2.3, 3.2, 3.3, 4.2, 4.3) ////////////////////////////////////////////////////////////////// // STATE protected BasicDBObject _resultObj = null; // (can be the parent object or a child object) protected boolean _isDirectory = false; protected String _originalUrl = null; protected boolean _isShare = false; protected boolean _isCustom = false; protected boolean _isCustomAppend = false; // Custom state: protected ObjectId _virtualDirStartLimit = null; protected ObjectId _virtualDirEndLimit = null; // Share stuff: protected GridFSZipFile _zipView = null; // (always the parent zip) protected ZipEntry _zipEntry = null; protected String _zipViewFilename = null; // (just for display) }