// Java tutorial
/* * Copyright 2003-2004 Michael Franken, Zilverline. * * The contents of this file, or the files included with this file, are subject to * the current version of ZILVERLINE Collaborative Source License for the * Zilverline Search Engine (the "License"); You may not use this file except in * compliance with the License. * * You may obtain a copy of the License at * * http://www.zilverline.org. * * See the License for the rights, obligations and * limitations governing use of the contents of the file. * * The Original and Upgraded Code is the Zilverline Search Engine. The developer of * the Original and Upgraded Code is Michael Franken. Michael Franken owns the * copyrights in the portions it created. All Rights Reserved. * */ package org.zilverline.service; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.util.ArrayList; import java.util.Enumeration; import java.util.Iterator; import java.util.List; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.springframework.util.StringUtils; import org.zilverline.core.DocumentCollection; import org.zilverline.core.ExtractorFactory; import org.zilverline.core.FileSystemCollection; import org.zilverline.core.Handler; import org.zilverline.core.IndexException; import org.zilverline.dao.CollectionManagerDAO; import org.zilverline.dao.DAOException; import org.zilverline.util.FileUtils; import org.zilverline.util.SysUtils; /** * The CollectionManagerImpl holds all collections, and base values for them. * * <p> * NB. This Bean gets instantiated in web-servlet.xml. 
* </p> * * @author Michael Franken * @version $Revision: 1.28 $ * * @see org.zilverline.core.FileSystemCollection */ public class CollectionManagerImpl implements CollectionManager { /** * @return Returns the analyzer. */ public String getAnalyzer() { return analyzer; } /** * Gets a collection by id. * * @param theId The id of the collection * * @return Collection or null if not found */ public DocumentCollection getCollection(final Long theId) { Iterator li = collections.iterator(); while (li.hasNext()) { DocumentCollection c = (DocumentCollection) li.next(); if (theId.equals(c.getId())) { return c; } } return null; } /** * DAO object taking care of persistence for CollectionManager and its collections. */ private transient CollectionManagerDAO dao; /** * MergeFactor for indexing process. */ private Integer mergeFactor; /** * minMergeDocs for indexing process. */ private Integer minMergeDocs; /** * maxMergeDocs for indexing process. */ private Integer maxMergeDocs; /** * priority for indexing process. */ private Integer priority = new Integer(2); /** * @return Returns the dao. */ public CollectionManagerDAO getDao() { return dao; } /** * Set the DAO for this CollectionManager. * * @param thisDao The dao to set. */ public void setDao(final CollectionManagerDAO thisDao) { this.dao = thisDao; } /** logger for Commons logging. */ private static Log log = LogFactory.getLog(CollectionManagerImpl.class); /** * String representation of Analyzer. Stored to present to user for selection. */ private String analyzer = "org.apache.lucene.analysis.standard.StandardAnalyzer"; /** Array containing all available Analyzers. */ private transient String[] allAnalyzers; /** * The Analyzer to be used in indexing and searching. StandardAnalyzer by default. */ private transient Analyzer analyzerObject = new StandardAnalyzer(); /** * The default cache base directory for all collections. The cache is the directory on disk where zipped content is unzipped for * indexing. 
By default use WEB-INF/cache */ private File cacheBaseDir = new File(new File(this.getClass().getResource("/").getFile()).getParentFile(), "cache"); /** The set of collections this CollectionManagerImpl manages. */ // TODO refactor this into a Set private List collections = new ArrayList(); /** * The default index base directory for all collections. The index is the directory on disk where a Lucene index is stored. By * default use WEB-INF/index * * @see org.apache.lucene.index.IndexReader */ private File indexBaseDir = new File(new File(this.getClass().getResource("/").getFile()).getParentFile(), "index"); /** The default for all collections whether to keep cache dir after indexing. */ private boolean keepCache = false; /** * The handler of archives, contains mappings for file extension to unarchiving programs. */ private Handler archiveHandler; /** * Factory containing Extractor mapppings. */ private ExtractorFactory factory = new ExtractorFactory(); /** * Array containing all Extractors by name. */ private transient String[] allExtractors; /** * Add or updates collection to list of collections, and sets the manager. * * Update occurs when a collection with the same id is found in the collections. Assigns an ID to a new Collection. 
* * @param col Collection containing documents */ public void addCollection(final DocumentCollection col) { long maxId = 0L; // TODO: Not so efficient and clean if (col.getId() != null) { // try to find previous occurence in list for (int i = 0; i < collections.size(); i++) { DocumentCollection thisCollection = (DocumentCollection) collections.get(i); maxId = Math.max(maxId, thisCollection.getId().longValue()); log.debug("max ID: " + maxId); if (col.getId().equals(thisCollection.getId())) { log.debug("Updating collection " + col.getId() + ", " + col.getName() + " to Manager at location " + i); collections.set(i, col); thisCollection.setManager(this); return; } } } else { for (int i = 0; i < collections.size(); i++) { DocumentCollection thisCollection = (DocumentCollection) collections.get(i); maxId = Math.max(maxId, thisCollection.getId().longValue()); log.debug("max ID: " + maxId); } } // it must be a new, or non existing collection, add it col.setId(new Long(maxId + 1L)); log.debug("Adding collection " + col.getId() + ", " + col.getName() + " to Manager at end"); collections.add(col); col.setManager(this); } /** * Deletes collection from list of collections. * * @param col Collection containing documents */ public void deleteCollection(final DocumentCollection col) { for (int i = 0; i < collections.size(); i++) { DocumentCollection thisCollection = (DocumentCollection) collections.get(i); if (col.getId().equals(thisCollection.getId())) { collections.remove(i); break; } } } /** * Indicates whether any indexing is going on. * * @return true if so. */ public boolean isIndexingInProgress() { for (int i = 0; i < collections.size(); i++) { DocumentCollection thisCollection = (DocumentCollection) collections.get(i); if (thisCollection.isIndexingInProgress()) { return true; } } return false; } /** * Store the CollectionManager to store. 
* * @throws IndexException when collectionManager can not be saved to underlying store */ public void store() throws IndexException { if (dao != null) { try { dao.store(this); } catch (DAOException e) { throw new IndexException("Can not save IndexService", e); } } else { log.error("No DAO set for IndexService"); } } /** * Returns an Analyzer for this collection based on configuration. * * @return the Analyzer used to index and search this collection */ // TODO: rework this, this should actually be getAnalyzer, but that doesn't // work because of hack above. public Analyzer createAnalyzer() { return analyzerObject; } /** * Get the cache base directory. * * @return String the directory where the cache sits */ public File getCacheBaseDir() { return cacheBaseDir; } /** * Gets a collection by name. * * @param theName The name of the collection * * @return Collection or null if not found */ public DocumentCollection getCollectionByName(final String theName) { if (theName == null) { return null; } Iterator li = collections.iterator(); while (li.hasNext()) { DocumentCollection c = (DocumentCollection) li.next(); if (theName.equals(c.getName())) { return c; } } return null; } /** * Get all collections. * * @return collections List of collections */ public List getCollections() { return collections; } /** * Get the base directory for the index. * * @return the directory */ public File getIndexBaseDir() { return indexBaseDir; } /** * Initializes all collections. * * @throws IndexException if one of the collections can not be initialized. 
*/ public void init() throws IndexException { allExtractors = ExtractorFactory.findExtractorsOnClasspath(); allAnalyzers = Handler.findAnalyzersOnClasspath(); CollectionManager thatManager = dao.load(); if (thatManager != null) { this.cacheBaseDir = thatManager.getCacheBaseDir(); this.indexBaseDir = thatManager.getIndexBaseDir(); this.keepCache = thatManager.isKeepCache(); this.analyzer = thatManager.getAnalyzer(); this.archiveHandler = thatManager.getArchiveHandler(); this.factory = thatManager.getFactory(); this.priority = thatManager.getPriority(); this.mergeFactor = thatManager.getMergeFactor(); this.maxMergeDocs = thatManager.getMaxMergeDocs(); this.minMergeDocs = thatManager.getMinMergeDocs(); // if there is nothing, probably first time Zilverline runs collections.clear(); Iterator li = thatManager.getCollections().iterator(); try { while (li.hasNext()) { DocumentCollection c = (DocumentCollection) li.next(); log.debug("Adding collection to manager: " + c.getName()); this.addCollection(c); c.init(); } } catch (IndexException e) { throw new IndexException("Error initializing all indexes in CollectionManagerImpl", e); } } else { // possibly first time Zilverline runs setFactory(new ExtractorFactory()); setArchiveHandler(new Handler()); if (getCollections().isEmpty()) { // try { // // make an initial Collection // Properties props = System.getProperties(); // log.debug(props.toString()); // String profile = System.getProperty("user.home"); // if (profile != null) { // // probably on Windows // File myDocs = new File(profile); // if (myDocs.isDirectory()) { // log.info("Welcome to Zilverline. 
Creating initial collection at: " + myDocs); // // create a Collection // FileSystemCollection myCollection = new FileSystemCollection(); // myCollection.setName("My Documents"); // myCollection.setContentDir(myDocs); // addCollection(myCollection); // myCollection.init(); // myCollection.indexInThread(true); // } else { // log.debug("Can't find " + myDocs + ", skip creating default collection"); // } // } else { // log.debug("Can't find profile, skip creating default collection"); // } // // } catch (IndexException e) { // log.warn("Can't initialize, skip creating default collection", e); // } } } } /** * The default for all collections whether to keep cache dir after indexing. * * @return whether to keep the cache or not. */ public boolean isKeepCache() { return keepCache; } /** * Create an Analyzer as specified by the given String. * * @param analyzerClassName the name of the class. The class needs to be available on the classpath. */ public void setAnalyzer(final String analyzerClassName) { try { if (analyzerClassName != null) { analyzer = analyzerClassName; Class c = Class.forName(analyzerClassName); if (c != null) { log.debug("Returning Analyzer: " + analyzerClassName); analyzerObject = (Analyzer) c.newInstance(); } } } catch (InstantiationException e1) { log.debug("Can not initiate Analyzer '" + analyzerClassName, e1); } catch (IllegalAccessException e1) { log.debug("Can not access Analyzer " + analyzerClassName, e1); } catch (ClassNotFoundException e) { log.debug("Class not found: " + analyzerClassName, e); } } /** * The default cache base directory for all collections. The cache is the directory on disk where zipped content is unzipped for * indexing. * * @param thisDir the directory on disk */ public void setCacheBaseDir(final File thisDir) { cacheBaseDir = thisDir; } /** * The default index base directory for all collections. The index is the directory on disk where a Lucene index is stored. 
* * @param thisDir the directory on disk * * @see org.apache.lucene.index.IndexReader */ public void setIndexBaseDir(final File thisDir) { indexBaseDir = thisDir; } /** * Indicates whether a Collection cache should be kept after indexing. The value of this CollectionManagerImpl functions as * default for all Collections. * * @param b keep cache or not. */ public void setKeepCache(final boolean b) { keepCache = b; } /** * @return Returns the allAnalyzers. */ public String[] getAllAnalyzers() { return allAnalyzers; } /** * get the ArchiveHandler, which contains the mappings for unArchiving archives. * * @return object containing mappings for handling archives */ public Handler getArchiveHandler() { return archiveHandler; } /** * @return Returns the factory. */ public ExtractorFactory getFactory() { return factory; } /** * Set the ArchiveHandler. * * @param handler object containing mappings for handling archives */ public void setArchiveHandler(final Handler handler) { archiveHandler = handler; } /** * @param thatFactory The factory to set. */ public void setFactory(final ExtractorFactory thatFactory) { this.factory = thatFactory; } /** * Expands Archives to disk. 
This is used is 'on-the-fly' extraction from cache * * @param col the Collection to which cache this archive is extracted * @param zip the archive or directory that might contain archives * * @return true if archive(s) could be extracted * * @throws IndexException on error * * @see org.zilverline.web.CacheController */ public boolean expandArchive(final FileSystemCollection col, final File zip) throws IndexException { log.debug("getFromCache: document " + zip + " from : " + col.getName()); if (!zip.exists()) { log.warn(zip + " does not exist."); return false; } // in the recursion we could have a directory if (zip.isDirectory()) { File[] files = zip.listFiles(); for (int i = 0; i < files.length; i++) { expandArchive(col, files[i]); } } else { String extension = FileUtils.getExtension(zip); if ((archiveHandler != null) && archiveHandler.canUnPack(extension)) { // we have an archive log.debug(zip + " is an archive"); File dir = null; if (StringUtils.hasText(archiveHandler.getUnArchiveCommand(extension))) { // this is a zip: handle with java's zip capabilities log.debug(zip + " is a zip file"); dir = unZip(zip, col); } else { log.debug(zip + " is a external archive file"); dir = unPack(zip, col); } // recursively handle all archives in this one log.debug("Recurse into " + dir); File[] files = dir.listFiles(); for (int i = 0; i < files.length; i++) { expandArchive(col, files[i]); } return true; } else { if (archiveHandler == null || archiveHandler.getMappings() == null) { log.warn("Can't extract this type, no archiveHandler"); } else { log.warn("Can't extract this type, not a supported extension: " + extension); } } } return false; } /** * 'unpacks' a given archive file into cache directory with derived name. e.g. c:\temp\file.chm wil be unpacked into * [cacheDir]\file_chm\. 
* * @param sourceFile the Archive file to be unpacked * @param thisCollection the collection whose cache and contenDir is used * * @return File (new) directory containing unpacked file, null if unknown Archive */ public File unPack(final File sourceFile, final FileSystemCollection thisCollection) { File unPackDestinationDirectory = null; // based on file extension, lookup in the archiveHandler whether this is a known archive // we don't really have to do this, since it is already been checked in the calling indexDocs(), but better safe then sorry if (archiveHandler == null) { // we have an unknown archive log.warn("No archiveHandler found while trying to unPack " + sourceFile); return null; } String extension = FileUtils.getExtension(sourceFile); if (!archiveHandler.canUnPack(extension)) { // we have an unknown archive log.warn("No archiveHandler found for " + sourceFile); return null; } // Create destination where file will be unpacked unPackDestinationDirectory = file2CacheDir(sourceFile, thisCollection); log.debug("unpacking " + sourceFile + " into " + unPackDestinationDirectory); // get the command from the map by supplying the file extension String unArchiveCommand = archiveHandler.getUnArchiveCommand(extension); if (SysUtils.execute(unArchiveCommand, sourceFile, unPackDestinationDirectory)) { log.info("Executed: " + unArchiveCommand + " " + sourceFile + " in " + unPackDestinationDirectory); } else { log.warn( "Can not execute " + unArchiveCommand + " " + sourceFile + " in " + unPackDestinationDirectory); } // delete the archive file if it is in the cache, we don't need to // store it, since we've extracted the contents if (FileUtils.isIn(sourceFile, thisCollection.getCacheDirWithManagerDefaults())) { sourceFile.delete(); } return unPackDestinationDirectory; } /** * Takes a file and creates a directory with a derived name in the cacheDir. If the file was not already in cache, and sits in * contentDir it is mapped to cache, otherwise it stays within cache. 
* * <p> * e.g. given cachedir <code>c:\temp\</code> and contentdir <code>e:\docs\Projects\lucene\content\</code>, * <code>e:\docs\Projects\lucene\content\books.zip</code> yields <code>c:\temp\books_zip\</code> * </p> * * <p> * <code>c:\temp\books.zip</code> yields <code>c:\temp\books_zip\</code> * </p> * * @param sourceFile the file to be used as name for the directory * @param thisCollection the collection (not null) whose cache and contenDir is used * * @return File the (newly created) directory */ public static File file2CacheDir(final File sourceFile, final FileSystemCollection thisCollection) { log.debug("Entering file2Dir, with " + sourceFile + ", for collection:" + thisCollection.getName()); File unZipDestinationDirectory = null; try { File cacheDir = thisCollection.getCacheDirWithManagerDefaults(); // just to be sure the cacheDir exists if (!cacheDir.isDirectory()) { if (!cacheDir.mkdirs()) { log.warn("Can't create cache directory " + cacheDir); return null; } } // get the full path (not just the name, since we could have recursed into newly created directory String destinationDirectory = sourceFile.getCanonicalPath(); // change extension into _ int index = destinationDirectory.lastIndexOf('.'); String extension; if (index != -1) { extension = destinationDirectory.substring(index + 1); destinationDirectory = destinationDirectory.substring(0, index) + '_' + extension; } // if sourceFile still sits in contentdir it must be mapped to cache String collectionPath = thisCollection.getContentDir().getCanonicalPath(); if (destinationDirectory.startsWith(collectionPath)) { // chop off the first part (collectionPath) String relativePath = destinationDirectory.substring(collectionPath.length()); unZipDestinationDirectory = new File(thisCollection.getCacheDirWithManagerDefaults(), relativePath); log.debug( "Mapped " + relativePath + " to cache: " + thisCollection.getCacheDirWithManagerDefaults()); } else { unZipDestinationDirectory = new File(destinationDirectory); } 
// actually create the directory boolean canCreate = unZipDestinationDirectory.mkdirs(); if (!canCreate) { log.warn("Could not create: " + unZipDestinationDirectory); } log.debug("Created: " + unZipDestinationDirectory + " from File: " + sourceFile); } catch (Exception e) { log.error("error creating directory from file: " + sourceFile, e); } return unZipDestinationDirectory; } /** * unZips a given zip file into cache directory with derived name. e.g. c:\temp\file.zip wil be unziiped into * [cacheDir]\file_zip\. * * @param sourceZipFile the ZIP file to be unzipped * @param thisCollection the collection whose cache and contenDir is used * * @return File (new) directory containing zip file */ public static File unZip(final File sourceZipFile, final FileSystemCollection thisCollection) { // specify buffer size for extraction final int aBUFFER = 2048; File unzipDestinationDirectory = null; ZipFile zipFile = null; FileOutputStream fos = null; BufferedOutputStream dest = null; BufferedInputStream bis = null; try { // Specify destination where file will be unzipped unzipDestinationDirectory = file2CacheDir(sourceZipFile, thisCollection); log.info("unzipping " + sourceZipFile + " into " + unzipDestinationDirectory); // Open Zip file for reading zipFile = new ZipFile(sourceZipFile, ZipFile.OPEN_READ); // Create an enumeration of the entries in the zip file Enumeration zipFileEntries = zipFile.entries(); while (zipFileEntries.hasMoreElements()) { // grab a zip file entry ZipEntry entry = (ZipEntry) zipFileEntries.nextElement(); String currentEntry = entry.getName(); log.debug("Extracting: " + entry); File destFile = new File(unzipDestinationDirectory, currentEntry); // grab file's parent directory structure File destinationParent = destFile.getParentFile(); // create the parent directory structure if needed destinationParent.mkdirs(); // extract file if not a directory if (!entry.isDirectory()) { bis = new BufferedInputStream(zipFile.getInputStream(entry)); int currentByte; 
// establish buffer for writing file byte[] data = new byte[aBUFFER]; // write the current file to disk fos = new FileOutputStream(destFile); dest = new BufferedOutputStream(fos, aBUFFER); // read and write until last byte is encountered while ((currentByte = bis.read(data, 0, aBUFFER)) != -1) { dest.write(data, 0, currentByte); } dest.flush(); dest.close(); bis.close(); } } zipFile.close(); // delete the zip file if it is in the cache, we don't need to store // it, since we've extracted the contents if (FileUtils.isIn(sourceZipFile, thisCollection.getCacheDirWithManagerDefaults())) { sourceZipFile.delete(); } } catch (Exception e) { log.error("Can't unzip: " + sourceZipFile, e); } finally { try { if (fos != null) { fos.close(); } if (dest != null) { dest.close(); } if (bis != null) { bis.close(); } } catch (IOException e1) { log.error("Error closing files", e1); } } return unzipDestinationDirectory; } /** * @return Returns the allExtractors. */ public String[] getAllExtractors() { return allExtractors; } /** * @return Returns the mergeFactor. */ public Integer getMergeFactor() { return mergeFactor; } /** * @param mergeFactor The mergeFactor to set. */ public void setMergeFactor(Integer mergeFactor) { this.mergeFactor = mergeFactor; } /** * @return Returns the priority. */ public Integer getPriority() { return priority; } /** * @param priority The priority to set. */ public void setPriority(Integer priority) { this.priority = priority; } /** * @return Returns the maxMergeDocs. */ public Integer getMaxMergeDocs() { return maxMergeDocs; } /** * @param maxMergeDocs The maxMergeDocs to set. */ public void setMaxMergeDocs(Integer maxMergeDocs) { this.maxMergeDocs = maxMergeDocs; } /** * @return Returns the minMergeDocs. */ public Integer getMinMergeDocs() { return minMergeDocs; } /** * @param minMergeDocs The minMergeDocs to set. */ public void setMinMergeDocs(Integer minMergeDocs) { this.minMergeDocs = minMergeDocs; } }