Java tutorial
/* * Copyright (c) 2010- Austrian Research Institute for Artificial Intelligence (OFAI). * Copyright (C) 2014-2016 The University of Sheffield. * * This file is part of gateplugin-VirtualCorpus * (see https://github.com/johann-petrak/gateplugin-VirtualCorpus) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License * as published by the Free Software Foundation, either * version 3 of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this software. If not, see <http://www.gnu.org/licenses/>. */ package at.ofai.gate.virtualcorpus; import java.io.IOException; import java.io.File; import java.net.MalformedURLException; import java.net.URL; import java.util.List; import java.util.ArrayList; import java.util.Map; import java.util.ListIterator; import java.util.Iterator; import java.util.HashMap; import java.util.Collection; import gate.*; import gate.corpora.DocumentImpl; import gate.creole.*; import gate.creole.metadata.*; import gate.event.CorpusEvent; import gate.event.CorpusListener; import gate.event.CreoleEvent; import gate.event.CreoleListener; import gate.persist.PersistenceException; import gate.util.*; import gate.util.persistence.PersistenceManager; import java.util.HashSet; import java.util.Set; import org.apache.commons.io.FileUtils; import org.apache.log4j.Logger; // TODO: use DocumentFormat.getSupportedFileSuffixes() to get the list of // supported input file extensions, unless the user limits those through // a parameter. // If we enable gzip-compression then we also add all the above extensions // with .gz appended. // We allow to save back the files in the following formats: .xml .xml.gz and, // if the plugin is loaded and finf is supported, finf. // QUESTION: is it possible to use a runtime-generated list as a default list // for an init parameter to choose from / correct? // BIGGER change: by default only support formats which can be written back, // which would be xml, xml.gz and finf. In that case we may want to just // write back to the same file as we read from, no matter what. // But if additional read/only extensions are specified, then we may want // to give the format to use for writing back? // OR: create different directory corpora: read only corpus which supports // all formats, but must save with "Save" option and ReadWrite corpus which // only supports the formats which can be used both for reading and writing. // We could merge in the code from convertFormat to seperate out the // format conversion functionality! /** * A Corpus LR that mirrors files in a directory. In the default configuration, * just the <code>directoryURL</code> parameter is specified at creation and * all files that have a file extension of ".xml" and are not hidden are * accessible as documents through that corpus and automatically written back * to the directory when sync'ed or when unloaded (which does an implicit sync). * If the parameter <code>outDirectoryURL</code> * is also specified, the corpus reflects all the files from the * <code>directoryURL</code> directory but writes any changed documents into * the directory <code>outDirectoryURL</code>. If the parameter * <code>saveDocuments</code> is set to false, nothing is ever written * to either of the directories. * <p> * The main purpose of this Corpus implementation is that through it * a serial controller * can directly read and write from files stored in a directory. * This makes it much easier to share working pipelines between pipeline * developers, especially when the pipeline files are checked into SCS. * <p> * This LR does not implement the following methods: * <ul> * <li>toArray: none of the toArray methods is implemented. * </ul> * If the parameter "transientCorpus" is false, * this corpus LR automatically uses a "dummy datastore" internally. * This datastore is created and removed automatically when the corpus LR is * created and removed. This datastore cannot be used for anything useful, it * does not allow listing of resources or storing of anything but documents * that are already in the corpus. It is mainly here because GATE assumes that * documents are either transient or from a datastore. To avoid documents from * a DirectoryCorpus to get treated as transient documents, their DataStore is * set to this dummy DataStore. * <p> * Documents will always get saved to either the original file or to a file * in the outDocumentURL directory whenever the document is synced or unloaded. * <p> * NOTE: If you use the "Save as XML" option from the LR's context menu, be * careful not specify the directory where the corpus saves documents as * the target directory for the "Save as XML" function -- this might produce * unexpected results. Even if a different directory is specified, the * "Save as XML" function will still also re-save the documents in the * corpus directory unless the <code>saveDocuments</code> option is set to * false. * * @author Johann Petrak */ @CreoleResource(name = "DirectoryCorpus", interfaceName = "gate.Corpus", icon = "corpus", helpURL = "http://code.google.com/p/gateplugin-virtualcorpus/wiki/DirectoryCorpusUsage", comment = "A corpus backed by GATE documents in a directory or directory tree") public class DirectoryCorpus extends VirtualCorpus { //***** // Fields //****** /** * */ private static final long serialVersionUID = -8485161260415382902L; protected File backingDirectoryFile; //*************** // Parameters //*************** /** * Setter for the <code>directoryURL</code> LR initialization parameter. * @param dirURL The URL of the directory where the files for the corpus will * be read * from. If the <code>outDirectoryURL</code> is left empty the documents * will be written back to the original files in this directory when * unloaded (except when <code>saveDocuments</code> is set to false). */ @CreoleParameter(comment = "The directory URL where files will be read from") public void setDirectoryURL(URL dirURL) { this.directoryURL = dirURL; } /** * Getter for the <code>directoryURL</code> LR initialization parameter. * * @return The directory URL where files are read from and (and saved to * if unloaded when outDirectoryURL is not specified and saveDocuments * is true). */ public URL getDirectoryURL() { return this.directoryURL; } protected URL directoryURL = null; /** * File extensions to use for loading document. * If this is not empty, then only files with that extension will be visible * in the corpus. If it is left empty, the file extensions supported by * the currently loaded document formats will be visible. * Note that in both cases, any extension which does not have a document * exporter which supports that extension is ignored. * The PR will check for each extension at init time, for which of those * there is a registered document exporter for saving and will only use * that exporter for any saving. * * @param extensions */ @Optional @CreoleParameter(comment = "A list of file extensions which will be loaded into the corpus. If not specified, all supported file extensions. ") public void setExtensions(List<String> extensions) { this.extensions = extensions; } public List<String> getExtensions() { return extensions; } protected List<String> extensions; @Optional @CreoleParameter(comment = "Recursively get files from the directory (default: false)", defaultValue = "false") public void setRecurseDirectory(Boolean value) { this.recurseDirectory = value; } public Boolean getRecurseDirectory() { return recurseDirectory; } protected Boolean recurseDirectory; Map<String, DocumentExporter> extension2Exporter = new HashMap<String, DocumentExporter>(); private static final Logger logger = Logger.getLogger(DirectoryCorpus.class); /** * Initializes the DirectoryCorpus LR * @return * @throws ResourceInstantiationException */ @Override public Resource init() throws ResourceInstantiationException { logger.info("DirectoryCorpus: calling init"); if (directoryURL == null) { throw new ResourceInstantiationException("directoryURL must be set"); } // first of all, create a map that contains all the supported extensions // as keys and the corresponding documente exporter as value. // First, get all the supported extensions for reading files Set<String> readExtensions = DocumentFormat.getSupportedFileSuffixes(); logger.info("DirectoryCorpus/init readExtensions=" + readExtensions); Set<String> supportedExtensions = new HashSet<String>(); // if we also want to write, we have to limit the supported extensions // to those where we have an exporter and also we need to remember which // exporter supports which extensions if (!getReadonly()) { List<Resource> des = null; try { // Now get all the Document exporters des = Gate.getCreoleRegister().getAllInstances("gate.DocumentExporter"); } catch (GateException ex) { throw new ResourceInstantiationException("Could not get the document exporters", ex); } for (Resource r : des) { DocumentExporter d = (DocumentExporter) r; if (readExtensions.contains(d.getDefaultExtension())) { extension2Exporter.put(d.getDefaultExtension(), d); supportedExtensions.add(d.getDefaultExtension()); } } } else { supportedExtensions.addAll(readExtensions); } logger.info("DirectoryCorpus/init supportedExtensions=" + readExtensions); // now check if an extension list was specified by the user. If no, nothing // needs to be done. If yes, remove all the extensions from the extnesion2Exporter // map which were not specified and warn about all the extensions specified // for which we do not have an entry. Also remove them from the supportedExtensions set if (getExtensions() != null && !getExtensions().isEmpty()) { logger.info("DirectoryCorpu/init getExtgension is not empty: " + getExtensions()); for (String ext : getExtensions()) { if (!supportedExtensions.contains(ext)) { logger.warn("DirectoryCorpus warning: extension is not supported: " + ext); } } // now remove all the extensions which are not specified Iterator<String> it = supportedExtensions.iterator(); while (it.hasNext()) { String ext = it.next(); logger.info("DirectoryCorpus/init checking supported extension: " + ext); if (!getExtensions().contains(ext)) { logger.info("DirectoryCorpus/init removing extension: " + ext); it.remove(); extension2Exporter.remove(ext); } } } logger.info("DirectoryCorpus/init supportedExtensions after parms: " + supportedExtensions); logger.info("DirectoryCorpus/init exporter map: " + extension2Exporter); if (supportedExtensions.isEmpty()) { throw new ResourceInstantiationException( "DirectoryCorpus could not be created, no file format supported or loaded"); } backingDirectoryFile = Files.fileFromURL(directoryURL); try { backingDirectoryFile = backingDirectoryFile.getCanonicalFile(); } catch (IOException ex) { throw new ResourceInstantiationException("Cannot get canonical file for " + backingDirectoryFile, ex); } if (!backingDirectoryFile.isDirectory()) { throw new ResourceInstantiationException("Not a directory " + backingDirectoryFile); } try { ourDS = (DummyDataStore4DirCorp) Factory.createDataStore( "at.ofai.gate.virtualcorpus.DummyDataStore4DirCorp", backingDirectoryFile.getAbsoluteFile().toURI().toURL().toString()); ourDS.setName("DummyDS4_" + this.getName()); ourDS.setComment("Dummy DataStore for DirectoryCorpus " + this.getName()); ourDS.setCorpus(this); //System.err.println("Created dummy corpus: "+ourDS+" with name "+ourDS.getName()); } catch (Exception ex) { throw new ResourceInstantiationException("Could not create dummy data store", ex); } logger.info("DirectoryCorpus/init: ds created: " + ourDS.getName()); Iterator<File> fileIt = FileUtils.iterateFiles(backingDirectoryFile, supportedExtensions.toArray(new String[0]), getRecurseDirectory()); int i = 0; while (fileIt.hasNext()) { File file = fileIt.next(); // if recursion was specified, we need to get the relative file path // relative to the root directory. This is done by getting the canonical // full path name for both the directory and the file and then // relativizing the path. String filename = file.getName(); // TODO: first check if this file should be ignored (hidden files?) if (!filename.startsWith(".")) { if (getRecurseDirectory()) { try { file = file.getCanonicalFile(); } catch (IOException ex) { throw new ResourceInstantiationException("Could not get canonical path for " + file); } filename = backingDirectoryFile.toURI().relativize(file.toURI()).getPath(); } documentNames.add(filename); isLoadeds.add(false); documentIndexes.put(filename, i); i++; } } if (i == 0) { logger.warn("DirectoryCorpus warning: empty immutable corpus created, no files found"); } try { PersistenceManager.registerPersistentEquivalent(at.ofai.gate.virtualcorpus.DirectoryCorpus.class, at.ofai.gate.virtualcorpus.DirectoryCorpusPersistence.class); } catch (PersistenceException e) { throw new ResourceInstantiationException("Could not register persistence", e); } Gate.getCreoleRegister().addCreoleListener(this); return this; } @Override public void cleanup() { // TODO: // deregister our listener for resources of type document // Gate.getDataStoreRegister().remove(ourDS); } // Methods to be implemented from List /** * Add a document to the corpus. If the document has a name that is already * in the list of documents, return false and do not add the document. * Note that only the name is checked! * If the name of the document added is not ending in ".xml", a * GateRuntimeException is thrown. * If the document is already adopted by some data store throw an exception. * IMPORTANT: this is NOT IMPLEMENTED at the moment! */ @Override public boolean add(Document doc) { throw new MethodNotImplementedException(notImplementedMessage("add(Document doc)")); } /* public boolean add(Document doc) { if(!saveDocuments) { return false; } //System.out.println("DocCorp: called add(Object): "+doc.getName()); String docName = doc.getName(); ensureValidDocumentName(docName,true); Integer index = documentIndexes.get(docName); if(index != null) { return false; // if that name is already in the corpus, do not add } else { // for now, we do not allow any document to be added that is already // adopted by a datastore. if(doc.getDataStore() != null) { throw new GateRuntimeException("Cannot add "+doc.getName()+" which belongs to datastore "+doc.getDataStore().getName()); } saveDocument(doc); int i = documentNames.size(); documentNames.add(docName); documentIndexes.put(docName, i); isLoadeds.add(false); if(!isTransientCorpus) { adoptDocument(doc); } fireDocumentAdded(new CorpusEvent( this, doc, i, CorpusEvent.DOCUMENT_ADDED)); return true; } } */ /** * This removes all documents from the corpus. Note that this does nothing * when the saveDocuments parameter is set to false. * If the outDirectoryURL parameter was set, this method will throw * a GateRuntimeException. * IMPORTANT: this is not implemented at the moment!! */ @Override public void clear() { throw new MethodNotImplementedException(notImplementedMessage("clear()")); } /* public void clear() { if(!saveDocuments) { return; } if(outDirectoryURL != null) { throw new GateRuntimeException( "clear method not supported when outDirectoryURL is set for "+ this.getName()); } for(int i=documentNames.size()-1; i>=0; i--) { remove(i); } } */ /** * This checks if a document with the same name as the document * passed is already in the corpus. * IMPORTANT: The content is not considered * for this, only the name is relevant! */ @Override public boolean contains(Object docObj) { Document doc = (Document) docObj; String docName = doc.getName(); return (documentIndexes.get(docName) != null); } /** * Return the document for the given index in the corpus. * An IndexOutOfBoundsException is thrown when the index is not contained * in the corpus. * The document will be read from the file only if it is not already loaded. * If it is already loaded a reference to that document is returned. * * @param index * @return */ @Override public Document get(int index) { //System.out.println("DirCorp: called get(index): "+index); if (index < 0 || index >= documentNames.size()) { throw new IndexOutOfBoundsException( "Index " + index + " not in corpus " + this.getName() + " of size " + documentNames.size()); } String docName = documentNames.get(index); if (isDocumentLoaded(index)) { Document doc = loadedDocuments.get(docName); //System.out.println("Returning loaded document "+doc); return doc; } //System.out.println("Document not loaded, reading"); Document doc = readDocument(docName); loadedDocuments.put(docName, doc); isLoadeds.set(index, true); adoptDocument(doc); return doc; } /** * Returns the index of the document with the same name as the given document * in the corpus. The content of the document is not considered for this. * * @param docObj * @return */ @Override public int indexOf(Object docObj) { Document doc = (Document) docObj; String docName = doc.getName(); Integer index = documentIndexes.get(docName); if (index == null) { return -1; } else { return index; } } /** * Returns an iterator to iterate through the documents of the * corpus. The iterator does not allow modification of the corpus. * * @return */ @Override public Iterator<Document> iterator() { return new DirectoryCorpusIterator(); } /** * Removes the document with the given index from the corpus. This is not * supported and throws a GateRuntimeException if the outDirectoryURL * was specified for this corpus. If the saveDocuments parameter is false * for this corpus, this method does nothing. * A document which is removed from the corpus will have its dummy * datastore removed and look like a transient document again. * * IMPORTANT: this is NOT IMPLEMENTED yet! * * @param index * @return the document that was just removed from the corpus */ @Override public Document remove(int index) { throw new MethodNotImplementedException(notImplementedMessage("remove(int index)")); } /* public Document remove(int index) { Document doc = (Document)get(index); String docName = documentNames.get(index); documentNames.remove(index); if(isLoadeds.get(index)) { loadedDocuments.remove(docName); } isLoadeds.remove(index); documentIndexes.remove(docName); removeDocument(docName); if (!isTransientCorpus) { try { doc.setDataStore(null); } catch (PersistenceException ex) { // this should never happen } } fireDocumentRemoved(new CorpusEvent( this, doc, index, CorpusEvent.DOCUMENT_REMOVED)); return doc; } */ /** * Removes a document with the same name as the given document * from the corpus. This is not * supported and throws a GateRuntimeException if the outDirectoryURL * was specified for this corpus. If the saveDocuments parameter is false * for this corpus, this method does nothing and always returns false. * If the a document with the same name as the given document is not * found int the corpus, this does nothing and returns false. * * @param docObj * @return true if a document was removed from the corpus */ @Override public boolean remove(Object docObj) { throw new MethodNotImplementedException(notImplementedMessage("remove(Object docObj)")); } /* public boolean remove(Object docObj) { int index = indexOf(docObj); if(index == -1) { return false; } String docName = documentNames.get(index); documentNames.remove(index); isLoadeds.remove(index); documentIndexes.remove(docName); removeDocument(docName); Document doc = isDocumentLoaded(index) ? (Document)get(index) : null; if (!isTransientCorpus) { try { doc.setDataStore(null); } catch (PersistenceException ex) { // this should never happen } } fireDocumentRemoved(new CorpusEvent( this, doc, index, CorpusEvent.DOCUMENT_REMOVED)); return true; } */ /** * Remove all the documents in the collection from the corpus. * * @param coll * @return true if any document was removed */ @Override public boolean removeAll(Collection coll) { throw new MethodNotImplementedException(notImplementedMessage("removeAll(Collection coll)")); } /* public boolean removeAll(Collection coll) { boolean ret = false; for(Object docObj : coll) { ret = ret || remove(docObj); } return ret; } */ /** * This method is not implemented and always throws a * MethodNotImplementedException. * * @param index * @param obj * @return */ @Override public Document set(int index, Document obj) { throw new gate.util.MethodNotImplementedException(notImplementedMessage("set(int,Object)")); } @Override public int size() { return documentNames.size(); } /** * This method is not implemented and always throws a * MethodNotImplementedException. * * @param i1 * @param i2 * @return */ @Override public List<Document> subList(int i1, int i2) { throw new gate.util.MethodNotImplementedException(notImplementedMessage("subList(int,int)")); } //************************** // helper methods // ************************ // This method should only get called by the datastore when a document // is synced. This will happen automatically when a document is unloaded // or when a document is deliberately synced via its datastore. @Override protected void saveDocument(Document doc) { //System.out.println("DirCorp: save doc "+doc.getName()); // If the corpus is read-only, nothing gets saved if (getReadonly()) { return; } String docName = doc.getName(); // get the extension and then look up the document exporter for that // extension which will be used to do the actual saving. int extDotPos = docName.lastIndexOf("."); if (extDotPos <= 0) { throw new GateRuntimeException( "Did not find a file name extensions when trying to save document " + docName); } String ext = docName.substring(extDotPos + 1); if (ext.isEmpty()) { throw new GateRuntimeException("Encountered empty extension when trying to save document " + docName); } DocumentExporter de = extension2Exporter.get(ext); logger.info("DirectoryCorpus/saveDocument exit is " + ext + " exporter " + de); File docFile = new File(backingDirectoryFile, docName); try { logger.info("DirectoryCorpus/saveDocument trying to save document " + doc.getName() + " using exporter " + de); de.export(doc, docFile); logger.info("DirectoryCorpus/saveDocument saved: " + doc.getName()); } catch (IOException ex) { throw new GateRuntimeException("Could not save file: " + docFile, ex); } } protected Document readDocument(String docName) { //System.out.println("DirCorp: read doc "+docName); File docFile = new File(backingDirectoryFile, docName); URL docURL; Document doc = null; try { docURL = docFile.toURI().toURL(); } catch (MalformedURLException ex) { throw new GateRuntimeException("Could not create URL for document name " + docName, ex); } FeatureMap params = Factory.newFeatureMap(); params.put(Document.DOCUMENT_URL_PARAMETER_NAME, docURL); try { doc = (Document) Factory.createResource(DocumentImpl.class.getName(), params, null, docName); } catch (ResourceInstantiationException ex) { throw new GateRuntimeException("Could not create Document from file " + docFile, ex); } return doc; } // NOTE: not used at the moment, our corpus is always immutable so far! /* protected void removeDocument(String docName) { File docFile = new File(backingDirectoryFile, docName); docFile.delete(); } */ protected void adoptDocument(Document doc) { try { doc.setDataStore(ourDS); //System.err.println("Adopted document "+doc.getName()); } catch (PersistenceException ex) { //System.err.println("Got exception when adopting: "+ex); } } protected class DirectoryCorpusIterator implements Iterator<Document> { int nextIndex = 0; @Override public boolean hasNext() { return (documentNames.size() > nextIndex); } @Override public Document next() { if (hasNext()) { return get(nextIndex++); } else { return null; } } @Override public void remove() { throw new MethodNotImplementedException(); } } } // class DirectoryCorpus