Java tutorial
/** * Copyright 2009 Humboldt-Universitt zu Berlin, INRIA. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * */ package org.corpus_tools.pepper.core; import java.io.File; import java.io.FileInputStream; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.text.DecimalFormat; import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.Vector; import java.util.concurrent.CancellationException; import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import javax.xml.stream.XMLOutputFactory; import javax.xml.stream.XMLStreamException; import org.apache.commons.lang3.time.DurationFormatUtils; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; import org.corpus_tools.pepper.common.CorpusDesc; import org.corpus_tools.pepper.common.DOCUMENT_STATUS; import org.corpus_tools.pepper.common.JOB_STATUS; import org.corpus_tools.pepper.common.MEMORY_POLICY; import org.corpus_tools.pepper.common.MODULE_TYPE; import org.corpus_tools.pepper.common.PepperConfiguration; import 
org.corpus_tools.pepper.common.PepperJob; import org.corpus_tools.pepper.common.PepperUtil; import org.corpus_tools.pepper.common.StepDesc; import org.corpus_tools.pepper.exceptions.NotInitializedException; import org.corpus_tools.pepper.exceptions.PepperException; import org.corpus_tools.pepper.exceptions.PepperFWException; import org.corpus_tools.pepper.exceptions.PepperInActionException; import org.corpus_tools.pepper.exceptions.WorkflowException; import org.corpus_tools.pepper.modules.DocumentController; import org.corpus_tools.pepper.modules.PepperExporter; import org.corpus_tools.pepper.modules.PepperImporter; import org.corpus_tools.pepper.modules.PepperModule; import org.corpus_tools.pepper.modules.PepperModuleProperty; import org.corpus_tools.pepper.modules.coreModules.DoNothingExporter; import org.corpus_tools.pepper.modules.coreModules.DoNothingImporter; import org.corpus_tools.pepper.modules.exceptions.PepperModuleException; import org.corpus_tools.pepper.modules.exceptions.PepperModuleXMLResourceException; import org.corpus_tools.pepper.util.XMLStreamWriter; import org.corpus_tools.salt.SaltFactory; import org.corpus_tools.salt.common.SCorpusGraph; import org.corpus_tools.salt.common.SDocument; import org.corpus_tools.salt.common.SDocumentGraph; import org.corpus_tools.salt.common.SaltProject; import org.corpus_tools.salt.graph.Identifier; import org.corpus_tools.salt.util.SaltUtil; import org.eclipse.emf.common.util.URI; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; /** * This class represents a single, but entire conversion process in Pepper, * containing the entire workflow like import, manipulation and export. <br/> * This object contains a list of single steps. Each of such steps represent one * task in processing by a specific module. Such a step is represented by the * {@link Step} class. 
To get the list of all steps, see {@link #getSteps()}. * <strong>Note: Do not use this list to add a further step. Use * {@link #addStep(PepperModule)} instead.</strong> * * <h2>Order of tasks to be done for start of processing</h2> If you do not * follow this order in calling, and for instance call task 3 fist, than task 1 * and task 2 will be done automatically. * <ul> * <li>wiring of {@link ModuleControllerImpl}, {@link DocumentBus} and * {@link PepperJobImpl}</li> * <li>import of corpus structure</li> * <li>start</li> * </ul> * * @author Florian Zipser * */ public class PepperJobImpl extends PepperJob { private static final Logger logger = LoggerFactory.getLogger(PepperJobImpl.class); /** * Initializes a {@link PepperJobImpl} and sets its unique identifier. * * @param jobId * unique identifier for this job. The id is not changeable */ public PepperJobImpl(String jobId) { if ((jobId == null) || (jobId.isEmpty())) { throw new PepperFWException("Cannot initialize a PepperJob with an empty id."); } id = jobId; setSaltProject(SaltFactory.createSaltProject()); } /** The {@link SaltProject} which is converted by this job. **/ protected SaltProject saltProject = null; /** {@inheritDoc} **/ @Override public SaltProject getSaltProject() { return saltProject; } /** * Sets the {@link SaltProject} which is converted by this job. * * @param saltProject * new {@link SaltProject} */ public void setSaltProject(SaltProject saltProject) { if (inProgress.isLocked()) { throw new PepperInActionException( "Cannot set a new salt project to job '" + getId() + "', since this job was already started."); } this.saltProject = saltProject; } /** * properties to customize the behavior of conversion for this single job **/ private PepperConfiguration props = null; /** * returns the properties to customize the behavior of conversion for this * single job. 
* * @return given customization properties */ public PepperConfiguration getConfiguration() { if (props == null) { props = new PepperConfiguration(); } return props; } /** * Sets the properties to customize the behavior of conversion for this * single job * * @param conf * for customization */ public void setConfiguration(PepperConfiguration conf) { if (inProgress.isLocked()) { throw new PepperInActionException( "Cannot set a new configuration to job '" + getId() + "', since this job was already started."); } this.props = conf; setMemPolicy(getConfiguration().getMemPolicy()); setMaxNumerOfDocuments(getConfiguration().getMaxAmountOfDocuments()); } /** * A reference to the OSGi module resolver, to find modules matching to the * step description **/ protected ModuleResolver moduleResolver = null; /** * Returns a reference to the OSGi module resolver, to find modules matching * to the step description. * * @return reference to resolver to resolve {@link PepperModule} objects */ public ModuleResolver getModuleResolver() { return moduleResolver; } /** * Sets a reference to the OSGi module resolver, to find modules matching to * the step description. * * @param moduleResolver * reference to resolver to resolve {@link PepperModule} objects */ public void setModuleResolver(ModuleResolver moduleResolver) { if (inProgress.isLocked()) { throw new PepperInActionException("Cannot set a new module resolver to job '" + getId() + "', since this job was already started."); } this.moduleResolver = moduleResolver; } /** Internal list of all steps belonging to the manipulation phase. **/ private List<Step> manipulationSteps = null; /** * Returns the list of all steps belonging to the manipulation phase. * * @return list of {@link Step} objects. 
**/
    public List<Step> getManipulationSteps() {
        // read the field only once outside of the lock, the list is created
        // on first access
        List<Step> steps = manipulationSteps;
        if (steps == null) {
            synchronized (this) {
                steps = manipulationSteps;
                if (steps == null) {
                    steps = new Vector<Step>();
                    manipulationSteps = steps;
                }
            }
        }
        return steps;
    }

    /**
     * Internal list of all steps belonging to the import phase. Volatile,
     * since it is created lazily and first read outside of the synchronized
     * block in {@link #getImportSteps()}.
     **/
    private volatile List<Step> importSteps = null;

    /**
     * Returns the list of all steps belonging to the import phase. The list is
     * created on first access.
     *
     * @return list of {@link Step} objects.
     **/
    public List<Step> getImportSteps() {
        List<Step> steps = importSteps;
        if (steps == null) {
            synchronized (this) {
                steps = importSteps;
                if (steps == null) {
                    steps = new Vector<Step>();
                    importSteps = steps;
                }
            }
        }
        return steps;
    }

    /**
     * Internal list of all steps belonging to the export phase. Volatile,
     * since it is created lazily and first read outside of the synchronized
     * block in {@link #getExportSteps()}.
     **/
    private volatile List<Step> exportSteps = null;

    /**
     * Returns the list of all steps belonging to the export phase. The list is
     * created on first access.
     *
     * @return list of {@link Step} objects.
     **/
    public List<Step> getExportSteps() {
        List<Step> steps = exportSteps;
        if (steps == null) {
            synchronized (this) {
                steps = exportSteps;
                if (steps == null) {
                    steps = new Vector<Step>();
                    exportSteps = steps;
                }
            }
        }
        return steps;
    }

    /**
     * Returns a list of all steps, no matter to which phase they belong.
     * <br/>
     * <strong>This computation could be expensive, when working more than once
     * with the list, make a local copy and don't call this method
     * twice.</strong>
     *
     * @return list of {@link Step} objects.
**/ public List<Step> getAllSteps() { List<Step> allSteps = new Vector<Step>(); if (getImportSteps() != null) { allSteps.addAll(getImportSteps()); } if (getManipulationSteps() != null) { allSteps.addAll(getManipulationSteps()); } if (getExportSteps() != null) { allSteps.addAll(getExportSteps()); } return (allSteps); } /** * Overrides method {@link PepperJob#addStepDesc(StepDesc)}, but calls it * via super and than calls {@link #addStep(Step)}.<br/> * {@inheritDoc PepperJob#addStepDesc(StepDesc)} */ @Override public synchronized void addStepDesc(StepDesc stepDesc) { super.addStepDesc(stepDesc); addStep(stepDesc); } /** * Creates a new {@link Step} object containing all values of the passed * {@link StepDesc} object and adds it to the workflow covered by this * {@link PepperJobImpl}. Further tries to resolve the described * {@link PepperModule}. * * @param stepDesc * {@link StepDesc} object to be added to internal list * @return the created {@link Step} object */ public synchronized Step addStep(StepDesc stepDesc) { if (inProgress.isLocked()) { throw new PepperInActionException("Cannot add a new step description to job '" + getId() + "', since this job was already started."); } if (stepDesc == null) { throw new WorkflowException("Cannot deal with an empty StepDesc object for job '" + getId() + "'."); } if (getModuleResolver() == null) { throw new PepperFWException("Cannot add step '" + stepDesc + "', because no module resolver is set."); } Step step = null; if (MODULE_TYPE.MANIPULATOR.equals(stepDesc.getModuleType())) { step = new Step("ma" + (getManipulationSteps().size() + 1), stepDesc); } else if (MODULE_TYPE.IMPORTER.equals(stepDesc.getModuleType())) { step = new Step("im" + (getImportSteps().size() + 1), stepDesc); } else if (MODULE_TYPE.EXPORTER.equals(stepDesc.getModuleType())) { step = new Step("ex" + (getExportSteps().size() + 1), stepDesc); } else { throw new WorkflowException("Cannot add step description, because the 'MODULE_TYPE' is not set."); } 
addStep(step); return (step); } /** * Adds the passed {@link Step} object to the workflow covered by this * {@link PepperJobImpl} object and tries to resolve the described * {@link PepperModule}. * <h2>Prerequisite</h2> * <ul> * <li>{@link #getModuleResolver()} must be set</li> * <li>{@link #getSaltProject()} must be set</li> * </ul> * * @param step * {@link Step} object to be added to internal list */ public synchronized void addStep(Step step) { if (inProgress.isLocked()) { throw new PepperInActionException( "Cannot add a new step to job '" + getId() + "', since this job was already started."); } if (step == null) { throw new WorkflowException("Cannot deal with an empty step object for job '" + getId() + "'."); } if ((step.getModuleController() == null) || (step.getModuleController().getPepperModule() == null)) { if (getModuleResolver() == null) { throw new PepperFWException("Cannot add the given step '" + step.getId() + "', because it does not contain a module controller and the module resolver for this job '" + getId() + "' is not set. 
So the Pepper module can not be estimated."); } if (getSaltProject() == null) { throw new PepperFWException("Cannot add a step '" + step.getId() + "', since no '" + SaltProject.class.getSimpleName() + "' is set for job '" + getId() + "'."); } PepperModule pepperModule = getModuleResolver().getPepperModule(step); if (pepperModule == null) { throw new WorkflowException( "No Pepper module matching to step '" + step.getId() + "' was found: " + step); } pepperModule.setSaltProject(getSaltProject()); step.setPepperModule(pepperModule); } step.getModuleController().setJob(this); if (MODULE_TYPE.MANIPULATOR.equals(step.getModuleType())) { getManipulationSteps().add(step); } else if (MODULE_TYPE.IMPORTER.equals(step.getModuleType())) { getImportSteps().add(step); } else if (MODULE_TYPE.EXPORTER.equals(step.getModuleType())) { getExportSteps().add(step); } } /** A list of all buses between the {@link ModuleControllerImpl} objects **/ protected List<DocumentBus> documentBuses = null; protected List<DocumentBus> getDocumentBuses() { if (documentBuses == null) { synchronized (this) { if (documentBuses == null) documentBuses = new Vector<DocumentBus>(); } } return (documentBuses); } /** * all documentBusses which are connected with {@link PepperImporter} * modules **/ protected List<DocumentBus> initialDocumentBuses = null; /** * Determines, if steps are already wired. This is necessary for * {@link #start()}. **/ protected boolean isWired = false; /** * Wires all {@link Step} objects being contained by this object to be ready * for {@link #start()}. Which means, that: * <ol> * <li>each step of {@link #getImportSteps()} is wired via a initial * document bus with the framework (no wiring, but listening on * {@link #ID_INTITIAL})</li> * <li>each step of {@link #getManipulationSteps()} is wired with following * step in list with a {@link DocumentBus}. 
* <li/> * <li>each step of {@link #getExportSteps()} is wired via a terminal * document bus with the framework (no wiring, but listening on * {@link #ID_TERMINAL})</li> * </ol> */ protected synchronized void wire() { if (getImportSteps().size() == 0) { throw new NotInitializedException("Cannot wire job '" + this + "', since no import steps were given."); } if (getExportSteps().size() == 0) { throw new NotInitializedException("Cannot wire job '" + this + "', since no export steps were given."); } // compute all ids of import steps List<String> importStepIds = new Vector<String>(); for (Step importStep : getImportSteps()) { importStepIds.add(importStep.getModuleController().getId()); } // compute all ids of export steps List<String> exportStepIds = new Vector<String>(); for (Step exportStep : getExportSteps()) { exportStepIds.add(exportStep.getModuleController().getId()); } initialDocumentBuses = new Vector<DocumentBus>(); for (Step importStep : getImportSteps()) { DocumentBus initialDocumentBus = new InitialDocumentBus(importStep.getModuleController().getId()); initialDocumentBus.setPepperJob(this); initialDocumentBus.setMemPolicy(getMemPolicy()); importStep.getModuleController().setInputDocumentBus(initialDocumentBus); initialDocumentBuses.add(initialDocumentBus); getDocumentBuses().add(initialDocumentBus); } // DocumentBus terminalDocumentBus= new // TerminalDocumentBus(exportStepIds, ID_TERMINAL); DocumentBus terminalDocumentBus = new TerminalDocumentBus(exportStepIds); terminalDocumentBus.setPepperJob(this); terminalDocumentBus.setMemPolicy(getMemPolicy()); getDocumentBuses().add(terminalDocumentBus); for (Step exportStep : getExportSteps()) { exportStep.getModuleController().setOutputDocumentBus(terminalDocumentBus); } // if there are manipulation steps to be done if (0 < getManipulationSteps().size()) { // connect all import steps with first manipulation step Step firstManipulationStep = getManipulationSteps().get(0); if (firstManipulationStep == null) { throw 
new PepperFWException("The first step in list of manipulation steps is null."); } // compute all ids of first manipulation step List<String> firstManipulationStepIds = new Vector<String>(); firstManipulationStepIds.add(firstManipulationStep.getModuleController().getId()); DocumentBus firstDocumentBus = new DocumentBus(importStepIds, firstManipulationStepIds); firstDocumentBus.setPepperJob(this); firstDocumentBus.setMemPolicy(getMemPolicy()); getDocumentBuses().add(firstDocumentBus); for (Step importStep : getImportSteps()) { importStep.getModuleController().setOutputDocumentBus(firstDocumentBus); } firstManipulationStep.getModuleController().setInputDocumentBus(firstDocumentBus); // connect all manipulation steps if (getManipulationSteps().size() > 1) { Step lastStep = null; for (Step manipulationStep : getManipulationSteps()) { if (manipulationStep == null) throw new PepperFWException("A manipulation step is null."); if (lastStep != null) { DocumentBus documentBus = new DocumentBus(lastStep.getModuleController().getId(), manipulationStep.getModuleController().getId()); documentBus.setPepperJob(this); documentBus.setMemPolicy(getMemPolicy()); getDocumentBuses().add(documentBus); lastStep.getModuleController().setOutputDocumentBus(documentBus); manipulationStep.getModuleController().setInputDocumentBus(documentBus); } lastStep = manipulationStep; } } Step lastManipulationStep = getManipulationSteps().get(getManipulationSteps().size() - 1); if (lastManipulationStep == null) { throw new PepperFWException("The last step in list of manipulation steps is null."); } // compute all ids of first manipulation step List<String> lastManipulationStepIds = new Vector<String>(); lastManipulationStepIds.add(lastManipulationStep.getModuleController().getId()); DocumentBus lastDocumentBus = new DocumentBus(lastManipulationStepIds, exportStepIds); lastDocumentBus.setPepperJob(this); lastDocumentBus.setMemPolicy(getMemPolicy()); getDocumentBuses().add(lastDocumentBus); 
lastManipulationStep.getModuleController().setOutputDocumentBus(lastDocumentBus); for (Step exportStep : getExportSteps()) { exportStep.getModuleController().setInputDocumentBus(lastDocumentBus); } } else {// direct connect importers and exporters with one central bus DocumentBus centralBus = new DocumentBus(importStepIds, exportStepIds); centralBus.setPepperJob(this); centralBus.setMemPolicy(getMemPolicy()); getDocumentBuses().add(centralBus); for (Step importStep : getImportSteps()) { importStep.getModuleController().setOutputDocumentBus(centralBus); } for (Step exportStep : getExportSteps()) { exportStep.getModuleController().setInputDocumentBus(centralBus); } } isWired = true; } /** flag to determine, if corpus structure has already been imported **/ protected volatile boolean isImportedCorpusStructure = false; /** * Imports corpus structures of all registered * {@link ImportCorpusStructureTest} steps. After calling * {@link PepperImporter#importCorpusStructure(SCorpusGraph)} , all * following modules will be asked, if they want to influence the order of * importing documents. If this is the case, an order is created and put to * all {@link PepperImporter} objects. <br/> * This method produces as much as {@link SCorpusGraph} objects as * {@link Step} given in import step list {@link #getImportSteps()}. The * position of {@link SCorpusGraph} corresponding to {@link PepperImporter} * (importing that graph) in {@link SaltProject#getCorpusGraphs()} is * equivalent to position of {@link Step} in list {@link #getImportSteps()}. 
*/ protected synchronized void importCorpusStructures() { try { if (!isWired) { wire(); } List<Future<?>> futures = new Vector<Future<?>>(); int numOfImportStep = 0; for (Step importStep : getImportSteps()) { if (getSaltProject() == null) { throw new PepperFWException("Cannot import corpus structure, because no salt project is set."); } SCorpusGraph sCorpusGraph = null; if ((getSaltProject().getCorpusGraphs().size() > numOfImportStep) && (getSaltProject().getCorpusGraphs().get(numOfImportStep) != null)) { sCorpusGraph = getSaltProject().getCorpusGraphs().get(numOfImportStep); } else { sCorpusGraph = SaltFactory.createSCorpusGraph(); getSaltProject().addCorpusGraph(sCorpusGraph); } futures.add(importStep.getModuleController().importCorpusStructure(sCorpusGraph)); numOfImportStep++; } for (Future<?> future : futures) { // wait until all corpus structures have been imported try { future.get(); } catch (ExecutionException e) { throw new PepperModuleException("Failed to import corpus by module. Nested exception was: ", e.getCause()); } catch (InterruptedException e) { throw new PepperFWException("Failed to import corpus by module. Nested exception was: ", e.getCause()); } catch (CancellationException e) { throw new PepperFWException("Failed to import corpus by module. 
Nested exception was: ", e.getCause()); } } int i = 0; for (Step step : getImportSteps()) { if (getSaltProject().getCorpusGraphs().get(i) == null) { throw new PepperModuleException("The importer '" + step.getModuleController().getPepperModule() + "' did not import a corpus structure."); } // handle proposed import order List<Identifier> importOrder = unifyProposedImportOrders(getSaltProject().getCorpusGraphs().get(i)); for (Identifier sDocumentId : importOrder) { DocumentControllerImpl documentController = new DocumentControllerImpl(); SDocument sDoc = (SDocument) sDocumentId.getIdentifiableElement(); if (sDoc.getDocumentGraph() == null) { sDoc.setDocumentGraph(SaltFactory.createSDocumentGraph()); } documentController.setDocument(sDoc); // sets flag to determine whether garbage collector should // be called after document was send to sleep if (getConfiguration() != null) { documentController.setCallGC(getConfiguration().getGcAfterDocumentSleep()); } getDocumentControllers().add(documentController); File docFile = null; String prefix = sDoc.getName(); File tmpPath = new File(getConfiguration().getWorkspace().getAbsolutePath() + "/" + getId()); if (!tmpPath.exists()) { if (!tmpPath.mkdirs()) { logger.warn("Cannot create folder {}. ", tmpPath); } } try { if (prefix.length() < 3) { prefix = prefix + "artificial"; } docFile = File.createTempFile(prefix, "." + SaltUtil.FILE_ENDING_SALT_XML, tmpPath); } catch (IOException e) { throw new PepperFWException( "Cannot store document '" + sDoc.getName() + "' to file '" + docFile + "' in folder for temporary files '" + tmpPath + "'. 
" + e.getMessage(), e); } documentController.setLocation(URI.createFileURI(docFile.getAbsolutePath())); if (!getConfiguration().getKeepDocuments()) { docFile.deleteOnExit(); } initialDocumentBuses.get(i).put(documentController); // notify document controller about all modules in workflow documentController.addModuleControllers(step.getModuleController()); for (Step manipulationStep : getManipulationSteps()) { documentController.addModuleControllers(manipulationStep.getModuleController()); } for (Step exportStep : getExportSteps()) { documentController.addModuleControllers(exportStep.getModuleController()); } } initialDocumentBuses.get(i).finish(InitialDocumentBus.ID_INTITIAL); i++; } isImportedCorpusStructure = true; } catch (RuntimeException e) { if (e instanceof PepperException) { throw (PepperException) e; } else { throw new PepperFWException("An exception occured in job '" + getId() + "' while importing the corpus-structure. See nested exception: ", e); } } } /** * Returns a list of {@link Identifier}s corresponding to the * {@link SDocument} objects contained in the passed {@link SCorpusGraph} * object. If all registered modules, do not make a proposal, the natural * one (the one given by the order of {@link SDocument}s in * {@link SCorpusGraph}) is taken. 
<strong>Note: Currently, this method does * not a real unification, if more than one proposals are given, the first * one is taken.</strong> * * @param sCorpusGraph * the {@link SCorpusGraph} for which the list has to be unified * @return unified list */ protected List<Identifier> unifyProposedImportOrders(SCorpusGraph sCorpusGraph) { List<Identifier> retVal = new Vector<Identifier>(); if (sCorpusGraph == null) { throw new PepperFWException("Cannot unify the import order, for an empty SCorpusGraph object."); } Vector<List<Identifier>> listOfOrders = new Vector<List<Identifier>>(); for (Step step : getAllSteps()) { if (step.getModuleController() == null) { throw new PepperFWException("Cannot unify proposed import orders, since step '" + step.getId() + "' does not contain a module controller."); } if (step.getModuleController().getPepperModule() == null) { throw new PepperFWException("Cannot unify proposed import orders, since module controller '" + step.getModuleController().getId() + "' does not contain a Pepper module."); } List<Identifier> importOrder = step.getModuleController().getPepperModule() .proposeImportOrder(sCorpusGraph); if ((importOrder != null) && (importOrder.size() > 0)) { if (importOrder.size() < sCorpusGraph.getDocuments().size()) { for (SDocument sDoc : sCorpusGraph.getDocuments()) { if (!importOrder.contains(sDoc.getIdentifier())) importOrder.add(sDoc.getIdentifier()); } } listOfOrders.add(importOrder); } } if (listOfOrders.size() == 0) { // if no proposals have been made, make the natural one for (SDocument sDocument : sCorpusGraph.getDocuments()) { retVal.add(sDocument.getIdentifier()); } } else if (listOfOrders.size() == 1) { retVal = listOfOrders.get(0); } else { retVal = listOfOrders.get(0); logger.warn( "Sorry the feature of unifying more than one list of proposed import orders is not yet implemented. 
"); // TODO do some fancy stuff for list unification } return (retVal); } /** * A list of all {@link DocumentControllerImpl} objects corresponding to * each {@link SDocument} belonging to this job. **/ protected List<DocumentController> documentControllers = null; /** * Returns a list of all {@link DocumentControllerImpl} objects * corresponding to each {@link SDocument} belonging to this job. * * @return a list of all {@link DocumentControllerImpl}s */ public List<DocumentController> getDocumentControllers() { if (documentControllers == null) { synchronized (this) { if (documentControllers == null) { documentControllers = new Vector<DocumentController>(); } } } return (documentControllers); } /** * {@inheritDoc PepperJob#getStatusReport()} */ @Override public String getStatusReport() { StringBuilder retVal = new StringBuilder(); retVal.append("--------------------------- pepper job status ---------------------------"); retVal.append("\n"); retVal.append("id:\t\t\t'"); retVal.append(getId()); retVal.append("\n"); retVal.append("active documents:\t"); retVal.append(getNumOfActiveDocuments()); retVal.append(" of "); retVal.append(getMaxNumberOfDocuments()); retVal.append("\n"); retVal.append("status:\t\t\t"); retVal.append(getStatus()); retVal.append("\n"); StringBuilder detailedStr = new StringBuilder(); double progressOverAll = 0; int numOfDocuments = 0; if (getDocumentControllers().isEmpty()) { retVal.append("- no documents found to display progress -\n"); } else { String sleep = " (sleep)"; int distance = 0; for (DocumentController docController : getDocumentControllers()) { String globalId = docController.getGlobalId(); if (distance < globalId.length()) { distance = globalId.length(); } } // distance is distance plus 4??? 
plus length of string 'sleep' distance = distance + 4 + sleep.length() + DOCUMENT_STATUS.IN_PROGRESS.toString().length(); StringBuilder docInfo = null; for (DocumentController docController : getDocumentControllers()) { docInfo = new StringBuilder(); numOfDocuments++; double progress = docController.getProgress(); progressOverAll = progressOverAll + progress; String progressStr = new DecimalFormat("###.##").format(progress * 100) + "%"; docInfo.append(docController.getGlobalId()); docInfo.append("("); docInfo.append(docController.getGlobalStatus()); if (docController.isAsleep()) { docInfo.append("/sleep"); } else { docInfo.append("/"); if (!DOCUMENT_STATUS.COMPLETED.equals(docController.getGlobalStatus()) && !DOCUMENT_STATUS.DELETED.equals(docController.getGlobalStatus()) && !DOCUMENT_STATUS.FAILED.equals(docController.getGlobalStatus())) { if (docController.getCurrentModuleController() == null || docController.getCurrentModuleController().getPepperModule() == null) { docInfo.append("???"); } else { docInfo.append(docController.getCurrentModuleController().getPepperModule().getName()); } } } docInfo.append(")"); detailedStr.append(String.format("%-" + distance + "s%8s", docInfo.toString(), progressStr)); detailedStr.append("\n"); } retVal.append("total progress:\t\t"); if (numOfDocuments != 0) { retVal.append(new DecimalFormat("###.##").format(progressOverAll / numOfDocuments * 100) + "%"); } retVal.append("\n"); retVal.append("processing time:\t"); retVal.append(DurationFormatUtils.formatDurationHMS(getProcessingTime())); retVal.append("\n"); if (getConfiguration().getDetaialedStatReport()) { retVal.append(detailedStr.toString()); } } retVal.append("-------------------------------------------------------------------------"); retVal.append("\n"); return (retVal.toString()); } /** Determines if {@link #checkReadyToStart()} was already called **/ protected boolean isReadyToStart = false; /** * Checks for each {@link PepperModule} in all steps, if it is ready to * 
start, via calling {@link PepperModule#isReadyToStart()}. * * @return a list of steps whose modules are not ready to start */ protected synchronized Collection<Pair<Step, Collection<String>>> checkReadyToStart() { ArrayList<Pair<Step, Collection<String>>> retVal = new ArrayList<>(); for (Step step : getAllSteps()) { if (!step.getModuleController().getPepperModule().isReadyToStart()) { Pair<Step, Collection<String>> stepReason = new ImmutablePair<Step, Collection<String>>(step, step.getModuleController().getPepperModule().getStartProblems()); retVal.add(stepReason); logger.error("Cannot run pepper job '" + getId() + "', because one of the involved modules '" + step.getModuleController().getPepperModule().getFingerprint() + "' is not ready to run."); } } return (retVal); } /** Stores the time when this job was started **/ private Long startTime = 0l; /** Returns the time when this job was started **/ private Long getStartTime() { return startTime; } /** * Returns the amount of time the job already took. * * @return time in milli seconds */ public Long getProcessingTime() { return System.currentTimeMillis() - startTime; } /** * Specifies if this job currently runs a conversion. If this is the case, * some other operations, like adding {@link Step}s cannot be done * simultaneously. **/ protected volatile ReentrantLock inProgress = new ReentrantLock(); /** * Starts the conversion of this job. * <ul> * <li>If the single steps of the job has not already been wired, they will * be wired. * <li> * <li>If {@link PepperImporter#importCorpusStructure(SCorpusGraph)} has not * already been called, it will be done. 
* <li> * </ul> */ public void convert() { if (!inProgress.tryLock()) { throw new PepperInActionException( "Cannot run convert() of job '" + getId() + "', since this job was already started."); } inProgress.lock(); try { startTime = System.currentTimeMillis(); status = JOB_STATUS.INITIALIZING; if (!isWired) { wire(); } if (!isReadyToStart) { Collection<Pair<Step, Collection<String>>> notReadyModules = checkReadyToStart(); if (notReadyModules.size() != 0) { StringBuilder str = new StringBuilder(); for (Pair<Step, Collection<String>> problems : notReadyModules) { str.append("["); str.append(problems.getLeft()); str.append(": "); str.append(problems.getRight()); str.append("], "); } throw new PepperException("Cannot run Pepper job '" + getId() + "', because at least one of the involved jobs is not ready to run: '" + str.toString() + "'. "); } } status = JOB_STATUS.IMPORTING_CORPUS_STRUCTURE; if (!isImportedCorpusStructure) { importCorpusStructures(); } status = JOB_STATUS.IMPORTING_DOCUMENT_STRUCTURE; List<Pair<ModuleControllerImpl, Future<?>>> futures = new Vector<Pair<ModuleControllerImpl, Future<?>>>(); // create a future for each step for (Step step : getAllSteps()) { if (step.getModuleController().getPepperModule().getSaltProject() == null) step.getModuleController().getPepperModule().setSaltProject(getSaltProject()); { futures.add(new ImmutablePair<ModuleControllerImpl, Future<?>>(step.getModuleController(), step.getModuleController().processDocumentStructures())); } } // log workflow information int stepNum = 0; // current number of step StringBuilder str = new StringBuilder(); for (Step step : getAllSteps()) { stepNum++; str.append("+----------------------------------- step "); str.append(stepNum); str.append(" -----------------------------------+\n"); String format = "|%-15s%-63s|\n"; str.append( String.format(format, step.getModuleType().toString().toLowerCase() + ":", step.getName())); str.append(String.format(format, "path:", 
step.getCorpusDesc().getCorpusPath())); if (MODULE_TYPE.IMPORTER.equals(step.getModuleType())) { int idxCorpusGraph = getSaltProject().getCorpusGraphs().indexOf( ((PepperImporter) step.getModuleController().getPepperModule()).getCorpusGraph()); str.append(String.format(format, "corpus index:", idxCorpusGraph)); } boolean hasProperties = false; StringBuilder propStr = new StringBuilder(); if (step.getModuleController().getPepperModule().getProperties() .getPropertyDesctriptions() != null) { // log all properties of all modules and their values format = "| %-25s%-38s|\n"; for (PepperModuleProperty<?> prop : step.getModuleController().getPepperModule().getProperties() .getPropertyDesctriptions()) { if (prop.getValue() != null) { hasProperties = true; propStr.append(String.format(format, prop.getName() + ":", prop.getValue())); } } } format = "|%-15s%-63s|\n"; if (hasProperties) { str.append(String.format(format, "properties:", "")); str.append(propStr.toString()); } else { str.append(String.format(format, "properties:", "- none -")); } str.append("| |\n"); } str.append("+------------------------------------------------------------------------------+\n"); logger.info(str.toString()); for (Pair<ModuleControllerImpl, Future<?>> future : futures) { // wait until all document-structures have been imported try { future.getRight().get(); } catch (ExecutionException e) { if ((e.getCause() != null) && (e.getCause() instanceof PepperException)) { throw (PepperException) e.getCause(); } throw new PepperModuleException("Failed to process document by module '" + future.getLeft() + "'. Nested exception was: ", e.getCause()); } catch (InterruptedException e) { if ((e.getCause() != null) && (e.getCause() instanceof PepperException)) { throw (PepperException) e.getCause(); } throw new PepperFWException("Failed to process document by module '" + future.getLeft() + "'. 
Nested exception was: ", e.getCause()); } catch (CancellationException e) { if ((e.getCause() != null) && (e.getCause() instanceof PepperException)) { throw (PepperException) e.getCause(); } throw new PepperFWException("Failed to process document by module '" + future.getLeft() + "'. Nested exception was: ", e.getCause()); } } status = JOB_STATUS.ENDED; } catch (RuntimeException e) { status = JOB_STATUS.ENDED_WITH_ERRORS; if (e instanceof PepperException) { throw (PepperException) e; } else { throw new PepperFWException( "An exception occured in job '" + getId() + "' while importing the corpus-structure. See nested exception: " + e.getMessage(), e); } } finally { inProgress.unlock(); } } /** * {@inheritDoc} */ @Override public void convertFrom() { if (getExportSteps().size() > 0) { logger.warn( "Cannot consider given export steps, any export step is ignored when invoking 'convertFrom()'. To create a conversion process with export steps use 'convert()' instead. "); exportSteps.clear(); } addStepDesc(new StepDesc().setName(DoNothingExporter.MODULE_NAME).setModuleType(MODULE_TYPE.EXPORTER) .setCorpusDesc(new CorpusDesc() .setCorpusPath(URI.createFileURI(PepperUtil.getTempFile().getAbsolutePath())))); convert(); } /** * {@inheritDoc} */ @Override public void convertTo() { if (getImportSteps().size() > 0) { logger.warn( "Cannot consider given import steps, any import step is ignored when invoking 'convertTo()'. To create a conversion process with import steps use 'convert()' instead. "); importSteps.clear(); } addStepDesc(new StepDesc().setName(DoNothingImporter.MODULE_NAME).setModuleType(MODULE_TYPE.IMPORTER) .setCorpusDesc(new CorpusDesc() .setCorpusPath(URI.createFileURI(PepperUtil.getTempFile().getAbsolutePath())))); convert(); } // ======================================= start: managing number of active // documents protected volatile MEMORY_POLICY memPolicy = MEMORY_POLICY.MODERATE; /** * Returns the set memory policy. 
* * @return */ public MEMORY_POLICY getMemPolicy() { return memPolicy; } /** * Sets the memory policy. Don't change the memory policy, when job was * started. * * @param memPolicy */ protected void setMemPolicy(MEMORY_POLICY memPolicy) { this.memPolicy = memPolicy; } /** * Determines the maximal number of {@link SDocument} objects which could be * processed at the same time **/ private volatile int maxNumOfDocuments = 10; protected void setMaxNumerOfDocuments(int maxNumOfDocuments) { this.maxNumOfDocuments = maxNumOfDocuments; } /** * Returns the maximal number of {@link SDocument} objects which could be * processed at the same time * * @return number of documents */ public int getMaxNumberOfDocuments() { return (maxNumOfDocuments); } /** * Returns the current number of {@link SDocument} objects which could be * processed at the same time * * @return number of documents */ public int getNumOfActiveDocuments() { return (getActiveDocuments().size()); } /** lock for correct change of {@link #currNumOfDocuments} **/ private volatile Lock numOfDocsLock = new ReentrantLock(); /** * condition for notifying {@link #getPermissionForProcessDoument()} when a * document was released via {@link #releaseDocument()} **/ private volatile Condition numOfDocsCondition = numOfDocsLock.newCondition(); /** A set of all currently active documents. **/ private Set<DocumentController> activeDocuments = null; /** Returns a set of all currently active documents. **/ public Set<DocumentController> getActiveDocuments() { if (activeDocuments == null) { activeDocuments = new HashSet<>(); } return (activeDocuments); } /** * Returns true, if a {@link SDocument} or more precisely spoken a * {@link SDocumentGraph} could be woken up or imported. This is the case, * as long as: <br/> * {@link #getNumOfActiveDocuments()} < {@link #getMaxNumberOfDocuments()}. 
* <br/> * Must be synchronized, * * @return true, when #getCurrNumberOfDocuments()} < * {@link #getMaxNumberOfDocuments(), false otherwise */ public boolean getPermissionForProcessDoument(DocumentController controller) { if (!MEMORY_POLICY.GREEDY.equals(getMemPolicy())) { numOfDocsLock.lock(); try { while (getNumOfActiveDocuments() >= getMaxNumberOfDocuments()) { numOfDocsCondition.await(); } getActiveDocuments().add(controller); } catch (InterruptedException e) { throw new PepperFWException("Something went wrong, when waiting for lock 'numOfDocsCondition'.", e); } finally { numOfDocsLock.unlock(); } } return (true); } /** * Releases a document and reduces the internal counter for the number of * currently processed documents ({@link #getNumOfActiveDocuments()}). */ public void releaseDocument(DocumentController controller) { numOfDocsLock.lock(); try { getActiveDocuments().remove(controller); // not sure, if signal() is correct, or if signalAll() should // be used, but I would think, that only one waiter has to be // notified --> seems to work correctly numOfDocsCondition.signal(); } finally { numOfDocsLock.unlock(); } } // ======================================= end: managing number of active // documents /** * {@inheritDoc PepperJob#save(URI)} */ @Override public URI save(URI uri) { if (uri == null) { throw new PepperException( "Cannot save Pepper job '" + getId() + "', because the passed uri is empty. "); } File file = null; if (PepperUtil.FILE_ENDING_PEPPER.equals(uri.fileExtension())) { // passed uri already points to a Pepper file file = new File(uri.toFileString()); } else { // uri points to a directory String directory = uri.toFileString(); if (!directory.endsWith("/")) { directory = directory + "/"; } file = new File(directory + getId() + "." 
+ PepperUtil.FILE_ENDING_PEPPER); } // create parent directory of file if (!file.getParentFile().exists()) { if (!file.getParentFile().mkdirs()) { if (!file.getParentFile().canWrite()) { throw new PepperModuleXMLResourceException( "Cannot create folder '" + file.getParentFile().getAbsolutePath() + "' to store Pepper workflow file, because of an access permission. "); } else { throw new PepperModuleXMLResourceException("Cannot create folder '" + file.getParentFile().getAbsolutePath() + "' to store Pepper workflow file. "); } } ; } XMLOutputFactory xof = XMLOutputFactory.newInstance(); XMLStreamWriter xml; try { xml = new XMLStreamWriter(xof.createXMLStreamWriter(new FileWriter(file.getAbsolutePath()))); xml.setPrettyPrint(true); xml.writeStartDocument(); // <pepper> xml.writeStartElement(WorkflowDescriptionReader.TAG_PEPEPR_JOB); if (getId() != null) { xml.writeAttribute(WorkflowDescriptionReader.ATT_ID, getId()); } xml.writeAttribute(WorkflowDescriptionReader.ATT_VERSION, "1.0"); // <customization> ??? 
List<StepDesc> importers = new ArrayList<>(); List<StepDesc> manipulators = new ArrayList<>(); List<StepDesc> exporters = new ArrayList<>(); for (StepDesc step : getStepDescs()) { if (MODULE_TYPE.IMPORTER.equals(step.getModuleType())) { importers.add(step); } else if (MODULE_TYPE.MANIPULATOR.equals(step.getModuleType())) { manipulators.add(step); } else if (MODULE_TYPE.EXPORTER.equals(step.getModuleType())) { exporters.add(step); } } // <importer> for (StepDesc step : importers) { xml.writeStartElement(WorkflowDescriptionReader.TAG_IMPORTER); save_module(xml, step); xml.writeEndElement(); } // <manipulator> for (StepDesc step : manipulators) { xml.writeStartElement(WorkflowDescriptionReader.TAG_MANIPULATOR); save_module(xml, step); xml.writeEndElement(); } // <exporter> for (StepDesc step : exporters) { xml.writeStartElement(WorkflowDescriptionReader.TAG_EXPORTER); save_module(xml, step); xml.writeEndElement(); } xml.writeEndElement(); xml.writeEndDocument(); xml.flush(); } catch (XMLStreamException | IOException e) { throw new PepperException("Cannot store Pepper job '" + getId() + "' because of a nested exception. 
", e); } return (URI.createFileURI(file.getAbsolutePath())); } /** * This method is just a helper method for method {@link #save(URI)} to * avoid boilerplate code * * @throws XMLStreamException **/ private void save_module(XMLStreamWriter xml, StepDesc step) throws XMLStreamException { if (step.getName() != null) { xml.writeAttribute(WorkflowDescriptionReader.ATT_NAME, step.getName()); } if (step.getCorpusDesc().getFormatDesc().getFormatName() != null) { xml.writeAttribute(WorkflowDescriptionReader.ATT_FORMAT_NAME, step.getCorpusDesc().getFormatDesc().getFormatName()); } if (step.getCorpusDesc().getFormatDesc().getFormatVersion() != null) { xml.writeAttribute(WorkflowDescriptionReader.ATT_FORMAT_VERSION, step.getCorpusDesc().getFormatDesc().getFormatVersion()); } if (step.getVersion() != null) { xml.writeAttribute(WorkflowDescriptionReader.ATT_NAME, step.getName()); } if ((step.getCorpusDesc() != null) && (step.getCorpusDesc().getCorpusPath() != null)) { xml.writeAttribute(WorkflowDescriptionReader.ATT_PATH, step.getCorpusDesc().getCorpusPath().toFileString()); } if ((step.getProps() != null) && (step.getProps().size() > 0)) { xml.writeStartElement(WorkflowDescriptionReader.TAG_CUSTOMIZATION); for (Object key : step.getProps().keySet()) { xml.writeStartElement(WorkflowDescriptionReader.TAG_PROP); xml.writeAttribute(WorkflowDescriptionReader.ATT_KEY, key.toString()); if (step.getProps().get(key) != null) { xml.writeCharacters(step.getProps().get(key).toString()); } xml.writeEndElement(); } xml.writeEndElement(); } } /** * {@inheritDoc PepperJob#clear()} */ @Override public void clear() { // remove all existing steps if (stepDescs != null) { stepDescs.clear(); } if (importSteps != null) { importSteps.clear(); } if (manipulationSteps != null) { manipulationSteps.clear(); } if (exportSteps != null) { exportSteps.clear(); } } /** * {@inheritDoc PepperJob#load(URI)} */ @Override public void load(URI uri) { if (uri.isFile()) { File wdFile = new File(uri.toFileString()); 
// set folder containing workflow description as base dir setBaseDir(uri.trimSegments(1)); SAXParser parser; XMLReader xmlReader; SAXParserFactory factory = SAXParserFactory.newInstance(); WorkflowDescriptionReader contentHandler = new WorkflowDescriptionReader(); contentHandler.setPepperJob(this); contentHandler.setLocation(uri); // remove all existing steps clear(); try { parser = factory.newSAXParser(); xmlReader = parser.getXMLReader(); xmlReader.setContentHandler(contentHandler); } catch (ParserConfigurationException e) { throw new PepperModuleXMLResourceException("Cannot load Pepper workflow description file '" + wdFile.getAbsolutePath() + "': " + e.getMessage() + ". ", e); } catch (Exception e) { throw new PepperModuleXMLResourceException("Cannot load Pepper workflow description file '" + wdFile.getAbsolutePath() + "': " + e.getMessage() + ". ", e); } try { InputStream inputStream = new FileInputStream(wdFile); Reader reader = new InputStreamReader(inputStream, "UTF-8"); InputSource is = new InputSource(reader); is.setEncoding("UTF-8"); xmlReader.parse(is); } catch (SAXException e) { try { parser = factory.newSAXParser(); xmlReader = parser.getXMLReader(); xmlReader.setContentHandler(contentHandler); xmlReader.parse(wdFile.getAbsolutePath()); } catch (Exception e1) { throw new PepperModuleXMLResourceException("Cannot load Pepper workflow description file '" + wdFile.getAbsolutePath() + "': " + e1.getMessage() + ". ", e1); } } catch (Exception e) { if (e instanceof PepperModuleException) { throw (PepperModuleException) e; } else { throw new PepperModuleXMLResourceException("Cannot load Pepper workflow description file'" + wdFile + "', because of a nested exception: " + e.getMessage() + ". ", e); } } } else { throw new UnsupportedOperationException( "Currently Pepper can only load workflow description from local files."); } } /** * Returns a textual representation of this Pepper job. 
<strong>Note: This
 * representation could not be used for serialization/deserialization
 * purposes.</strong> The text consists of the id of the job followed by
 * the names of its steps in curly brackets. The names are taken from the
 * step descriptions, or, if there are none, from the import, manipulation
 * and export steps.
 *
 * @return textual representation
 */
public String toString() {
    StringBuilder text = new StringBuilder();
    text.append(getId());
    // both variants are enclosed by the same brackets
    text.append("{");
    if (!getStepDescs().isEmpty()) {
        for (StepDesc desc : getStepDescs()) {
            text.append(desc.getName()).append(", ");
        }
    } else {
        if (getImportSteps() != null) {
            for (Step importStep : getImportSteps()) {
                text.append(importStep.getName()).append(", ");
            }
        }
        if (getManipulationSteps() != null) {
            for (Step manipulationStep : getManipulationSteps()) {
                text.append(manipulationStep.getName()).append(", ");
            }
        }
        if (getExportSteps() != null) {
            for (Step exportStep : getExportSteps()) {
                text.append(exportStep.getName()).append(", ");
            }
        }
    }
    text.append("}");
    return text.toString();
}
}