uk.ac.kcl.texthunter.core.MLModelMaker.java Source code

Java tutorial

Introduction

Here is the source code for uk.ac.kcl.texthunter.core.MLModelMaker.java

Source

//    Text Hunter: User friendly concept extraction from text
//
//    Copyright (C) 2014  Richard Jackson (richgjackson@yahoo.co.uk)
//
//    This program is free software: you can redistribute it and/or modify
//    it under the terms of the GNU General Public License as published by
//    the Free Software Foundation, either version 3 of the License, or
//    (at your option) any later version.
//
//    This program is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU General Public License for more details.
//
//    You should have received a copy of the GNU General Public License
//    along with this program.  If not, see <http://www.gnu.org/licenses/>.

package uk.ac.kcl.texthunter.core;

import uk.ac.kcl.texthunter.utils.Utils;
import gate.*;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.persist.PersistenceException;
import gate.util.AnnotationDiffer;
import gate.util.persistence.PersistenceManager;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.xml.stream.XMLStreamException;
import org.apache.commons.io.FileUtils;

public class MLModelMaker {

    // Corpus that executeXVal() saves under "outerCorpus" and that outerValidation() applies the trained models to.
    private Corpus fincorpus;
    // Corpus that executeXVal() saves under "xvalCorpus", hands to every XValidate, and that outerValidation() trains on.
    private Corpus opcorpus;
    // Number of XValidate workers that executeXVal() creates and starts.
    private int threadCount;
    // Workers registered by executeXVal(); cleanUp() interrupts their threads.
    private CopyOnWriteArrayList<XValidate> alXValidate;
    // Iterator over alXValidate, consumed by executeXVal() while starting the worker threads.
    private Iterator<XValidate> xvalARit;
    // Set to 5 by the constructor.
    private long outerCorpSizeRatio;
    // Supplies the best parameter set used by executeTrainAll().
    private ProjectXMLhandler projectXML;
    // File the GATE application is loaded from (loadApps) and saved to (executeTrainAll).
    private File allClassGapp;
    // Fold count passed to Parameter.generateParamList(); replaced by the saved value when resuming.
    private int folds = 2;
    // Exposed through getGoldStandardCorpus()/setGoldStandardCorpus().
    private Corpus goldStandardCorpus;

    /**
     * Returns the file that {@code loadApps()} reads the GATE application from and that
     * {@code executeTrainAll()} saves it to.
     *
     * @return the current {@code allClassGapp} reference, possibly {@code null}
     */
    public File getAllClassGapp() {
        return this.allClassGapp;
    }

    /**
     * Returns the parameter list that {@code executeXVal()} builds with
     * {@code Parameter.generateParamList}.
     *
     * @return the live {@code paramsList} reference (not a copy), possibly {@code null}
     */
    public CopyOnWriteArrayList<Parameter> getParamsList() {
        return this.paramsList;
    }

    // Initialised from GlobalParameters.TESTKEYANNOTSETNAME by the constructor.
    private String annSetA = null;
    // Initialised from GlobalParameters.MLANNOTSETNAME by the constructor.
    private String annSetB = null;
    // Initialised from GlobalParameters.CONTEXT by the constructor.
    private String type = null;
    // Initialised from GlobalParameters.MLFEATURENAME by the constructor.
    private String feature = null;
    // Path of the results directory; the .tsv result files and corpus sub-directories are created beneath it.
    private String resultsFile = null;
    // Path handed to Parameter.xmlConfigGenerator(); resetModel() works in its parent directory.
    private String MLConfigFile = null;
    // Parameter set currently being evaluated by outerValidation().
    private Parameter currentParams;
    // Corpus trained on by executeTrainAll() and split by executeXVal() via setUpCorpora().
    private Corpus trainingDocs = null;
    // Exposed through getApplicationLocation()/setApplicationLocation().
    private String applicationLocation;
    // Controller loaded from allClassGapp by loadApps().
    private CorpusController allClassApp;
    // Parameter set chosen by outerValidation() or fetched from projectXML by executeTrainAll().
    private Parameter finalParams;
    // Passed to Parameter.generateParamList() when cross-validation is not being resumed.
    private boolean roughValidation;
    // Parameter sets under evaluation; built by executeXVal().
    private volatile CopyOnWriteArrayList<Parameter> paramsList;
    // When true, executeXVal() reloads saved results and corpora instead of starting afresh.
    private boolean resumeXValidation;
    // Exposed through getBestResultF1()/setBestResultF1(); starts at 0.0.
    private volatile double bestResultF1 = 0.0;
    // Monitor held while GATE applications are loaded or their PRs re-initialised.
    private static final Object loadLock = new Object();
    // NOTE(review): no use of this lock is visible in this part of the file - confirm it is still needed.
    private static final Object writeLock = new Object();
    // Latch created by executeXVal() and awaited (with a timeout) by cleanUp().
    private CountDownLatch cdl;
    // Wrapper around allClassApp, created by loadApps().
    private TextHunterMLCCWrapper app;
    // Selects "APPLICATION_ALL_CLASSES" over "APPLICATION_POS_ONLY" when outerValidation() scores a model.
    private boolean multiClassMode;
    // True while one of the long-running operations of this class is in progress.
    private volatile boolean busy;

    /**
     * Reports whether the busy flag is currently set.
     *
     * @return the value of {@code busy}
     */
    public synchronized boolean isBusy() {
        return this.busy;
    }

    /**
     * Sets the busy flag.
     *
     * @param busy the new value of the flag
     */
    public synchronized void setBusy(boolean busy) {
        this.busy = busy;
    }

    // Shared instance returned by getInstance(); created when the class is initialised.
    private static MLModelMaker instance = new MLModelMaker();

    /**
     * Creates the shared instance: clears the busy flag, takes the annotation set names,
     * annotation type and feature name from {@code GlobalParameters}, and sets
     * {@code outerCorpSizeRatio} to 5. Everything else is supplied later through the setters.
     */
    private MLModelMaker() {
        this.busy = false;
        //new way with goldstandard xval
        this.annSetA = GlobalParameters.TESTKEYANNOTSETNAME;
        this.annSetB = GlobalParameters.MLANNOTSETNAME;
        this.type = GlobalParameters.CONTEXT;
        this.feature = GlobalParameters.MLFEATURENAME;
        this.outerCorpSizeRatio = 5;
    }
    //    MLModelMaker(String appLoc, String resultsLoc,
    //            int foldsNo, String gateHomeLoc,
    //            Corpus allDocsCorpus, boolean rough,
    //            boolean resume, int threadCount,
    //            ProjectXMLhandler projectXML, boolean multiClassMode) {
    //        //old way with internal xval
    //        this.alXValidate = new CopyOnWriteArrayList();
    //        this.applicationLocation = appLoc;
    //        applicationLocation
    //        this.folds = foldsNo;
    //        this.annSetA = GlobalParameters.TESTKEYANNOTSETNAME;
    //        this.annSetB = GlobalParameters.MLANNOTSETNAME;
    //        this.type = GlobalParameters.CONTEXT;
    //        this.feature = GlobalParameters.MLFEATURENAME;
    //        this.resultsFile = resultsLoc;
    //        this.MLConfigFile = appLoc + File.separator + "MLconfig.xml";
    //        this.trainingDocs = allDocsCorpus;
    //        this.roughValidation = rough;
    //        this.resumeXValidation = resume;
    //        this.threadCount = threadCount;
    //        this.outerCorpSizeRatio = 5;
    //        this.projectXML = projectXML;
    //        this.multiClassMode = multiClassMode;
    //        if (this.multiClassMode) {
    //            System.out.println("TextHunter is in MultiClass mode");
    //        } else {
    //            System.out.println("TextHunter is in Positive Instance mode");
    //        }
    //    }
    //

    //    and other
    //    MLModelMaker(String appLoc, String resultsLoc,
    //            int foldsNo,
    //            Corpus goldStandardCorpus, Corpus modelTrainingCorpus, boolean rough,
    //            boolean resume, int threadCount,
    //            ProjectXMLhandler projectXML, boolean multiClassMode) {
    //        //new way with goldstandard xval
    //        this.alXValidate = new CopyOnWriteArrayList();
    //        this.applicationLocation = appLoc;
    //        this.allClassGapp = new File(appLoc + File.separator + "all_classes.gapp");
    //        this.folds = foldsNo;
    //        this.annSetA = GlobalParameters.TESTKEYANNOTSETNAME;
    //        this.annSetB = GlobalParameters.MLANNOTSETNAME;
    //        this.type = GlobalParameters.CONTEXT;
    //        this.feature = GlobalParameters.MLFEATURENAME;
    //        this.resultsFile = resultsLoc;
    //        this.MLConfigFile = appLoc + File.separator + "MLconfig.xml";
    //        this.goldStandardCorpus = goldStandardCorpus;
    //        this.trainingDocs = modelTrainingCorpus;
    //        this.roughValidation = rough;
    //        this.resumeXValidation = resume;
    //        this.threadCount = threadCount;
    //        this.outerCorpSizeRatio = 5;
    //        this.projectXML = projectXML;
    //        this.multiClassMode = multiClassMode;
    //
    //        if (this.multiClassMode) {
    //            System.out.println("TextHunter is in MultiClass mode");
    //        } else {
    //            System.out.println("TextHunter is in Positive Instance mode");
    //        }
    //    }

    /**
     * Returns the corpus held in {@code fincorpus}.
     *
     * @return the current reference, possibly {@code null}
     */
    public Corpus getFincorpus() {
        return this.fincorpus;
    }

    /**
     * Replaces the corpus held in {@code fincorpus}.
     *
     * @param fincorpus the corpus to store
     */
    public void setFincorpus(Corpus fincorpus) {
        this.fincorpus = fincorpus;
    }

    /**
     * Returns the corpus held in {@code opcorpus}.
     *
     * @return the current reference, possibly {@code null}
     */
    public Corpus getOpcorpus() {
        return this.opcorpus;
    }

    /**
     * Replaces the corpus held in {@code opcorpus}.
     *
     * @param opcorpus the corpus to store
     */
    public void setOpcorpus(Corpus opcorpus) {
        this.opcorpus = opcorpus;
    }

    /**
     * Returns the number of worker threads {@code executeXVal()} will start.
     *
     * @return the configured thread count
     */
    public int getThreadCount() {
        return this.threadCount;
    }

    /**
     * Sets the number of worker threads {@code executeXVal()} will start.
     *
     * @param threadCount the thread count to use
     */
    public void setThreadCount(int threadCount) {
        this.threadCount = threadCount;
    }

    /**
     * Returns the list of {@code XValidate} workers.
     *
     * @return the live list reference (not a copy), possibly {@code null}
     */
    public CopyOnWriteArrayList<XValidate> getAlXValidate() {
        return this.alXValidate;
    }

    /**
     * Replaces the list of {@code XValidate} workers.
     *
     * @param alXValidate the list to store; the reference is kept as-is
     */
    public void setAlXValidate(CopyOnWriteArrayList<XValidate> alXValidate) {
        this.alXValidate = alXValidate;
    }

    /**
     * Returns the iterator over the {@code XValidate} workers.
     *
     * @return the current iterator, possibly {@code null}
     */
    public Iterator<XValidate> getXvalARit() {
        return this.xvalARit;
    }

    /**
     * Replaces the iterator over the {@code XValidate} workers.
     *
     * @param xvalARit the iterator to store
     */
    public void setXvalARit(Iterator<XValidate> xvalARit) {
        this.xvalARit = xvalARit;
    }

    /**
     * Returns the value of {@code outerCorpSizeRatio} (5 unless changed through the setter).
     *
     * @return the current ratio value
     */
    public long getOuterCorpSizeRatio() {
        return this.outerCorpSizeRatio;
    }

    /**
     * Sets the value of {@code outerCorpSizeRatio}.
     *
     * @param outerCorpSizeRatio the ratio value to store
     */
    public void setOuterCorpSizeRatio(long outerCorpSizeRatio) {
        this.outerCorpSizeRatio = outerCorpSizeRatio;
    }

    /**
     * Returns the project XML handler that {@code executeTrainAll()} asks for the best parameter.
     *
     * @return the current handler, possibly {@code null}
     */
    public ProjectXMLhandler getProjectXML() {
        return this.projectXML;
    }

    /**
     * Replaces the project XML handler.
     *
     * @param projectXML the handler to store
     */
    public void setProjectXML(ProjectXMLhandler projectXML) {
        this.projectXML = projectXML;
    }

    /**
     * Returns the number of cross-validation folds (2 by default).
     *
     * @return the current fold count
     */
    public int getFolds() {
        return this.folds;
    }

    /**
     * Sets the number of cross-validation folds.
     *
     * @param folds the fold count to use
     */
    public void setFolds(int folds) {
        this.folds = folds;
    }

    /**
     * Returns the corpus held in {@code goldStandardCorpus}.
     *
     * @return the current reference, possibly {@code null}
     */
    public Corpus getGoldStandardCorpus() {
        return this.goldStandardCorpus;
    }

    /**
     * Replaces the corpus held in {@code goldStandardCorpus}.
     *
     * @param goldStandardCorpus the corpus to store
     */
    public void setGoldStandardCorpus(Corpus goldStandardCorpus) {
        this.goldStandardCorpus = goldStandardCorpus;
    }

    /**
     * Returns the annotation set name held in {@code annSetA}
     * ({@code GlobalParameters.TESTKEYANNOTSETNAME} unless changed).
     *
     * @return the current value
     */
    public String getAnnSetA() {
        return this.annSetA;
    }

    /**
     * Sets the annotation set name held in {@code annSetA}.
     *
     * @param annSetA the name to store
     */
    public void setAnnSetA(String annSetA) {
        this.annSetA = annSetA;
    }

    /**
     * Returns the annotation set name held in {@code annSetB}
     * ({@code GlobalParameters.MLANNOTSETNAME} unless changed).
     *
     * @return the current value
     */
    public String getAnnSetB() {
        return this.annSetB;
    }

    /**
     * Sets the annotation set name held in {@code annSetB}.
     *
     * @param annSetB the name to store
     */
    public void setAnnSetB(String annSetB) {
        this.annSetB = annSetB;
    }

    /**
     * Returns the annotation type held in {@code type}
     * ({@code GlobalParameters.CONTEXT} unless changed).
     *
     * @return the current value
     */
    public String getType() {
        return this.type;
    }

    /**
     * Sets the annotation type held in {@code type}.
     *
     * @param type the type name to store
     */
    public void setType(String type) {
        this.type = type;
    }

    /**
     * Returns the feature name held in {@code feature}
     * ({@code GlobalParameters.MLFEATURENAME} unless changed).
     *
     * @return the current value
     */
    public String getFeature() {
        return this.feature;
    }

    /**
     * Sets the feature name held in {@code feature}.
     *
     * @param feature the feature name to store
     */
    public void setFeature(String feature) {
        this.feature = feature;
    }

    /**
     * Returns the path of the results directory under which the result files and the saved
     * corpora are written.
     *
     * @return the current path, possibly {@code null}
     */
    public String getResultsFile() {
        return this.resultsFile;
    }

    /**
     * Sets the path of the results directory.
     *
     * @param resultsFile the directory path to store
     */
    public void setResultsFile(String resultsFile) {
        this.resultsFile = resultsFile;
    }

    /**
     * Returns the path passed to {@code Parameter.xmlConfigGenerator}.
     *
     * @return the current path, possibly {@code null}
     */
    public String getMLConfigFile() {
        return this.MLConfigFile;
    }

    /**
     * Sets the path passed to {@code Parameter.xmlConfigGenerator}.
     *
     * @param MLConfigFile the configuration file path to store
     */
    public void setMLConfigFile(String MLConfigFile) {
        this.MLConfigFile = MLConfigFile;
    }

    /**
     * Returns the parameter set most recently selected for evaluation.
     *
     * @return the current reference, possibly {@code null}
     */
    public Parameter getCurrentParams() {
        return this.currentParams;
    }

    /**
     * Replaces the parameter set held in {@code currentParams}.
     *
     * @param currentParams the parameter set to store
     */
    public void setCurrentParams(Parameter currentParams) {
        this.currentParams = currentParams;
    }

    /**
     * Returns the training corpus.
     *
     * @return the current reference, possibly {@code null}
     */
    public Corpus getTrainingDocs() {
        return this.trainingDocs;
    }

    /**
     * Replaces the training corpus.
     *
     * @param trainingDocs the corpus to store
     */
    public void setTrainingDocs(Corpus trainingDocs) {
        this.trainingDocs = trainingDocs;
    }

    /**
     * Returns the value of {@code applicationLocation}.
     *
     * @return the current value, possibly {@code null}
     */
    public String getApplicationLocation() {
        return this.applicationLocation;
    }

    /**
     * Sets the value of {@code applicationLocation}.
     *
     * @param applicationLocation the location to store
     */
    public void setApplicationLocation(String applicationLocation) {
        this.applicationLocation = applicationLocation;
    }

    /**
     * Returns the controller loaded by {@code loadApps()}.
     *
     * @return the current controller, possibly {@code null}
     */
    public CorpusController getAllClassApp() {
        return this.allClassApp;
    }

    /**
     * Replaces the controller held in {@code allClassApp}.
     *
     * @param allClassApp the controller to store
     */
    public void setAllClassApp(CorpusController allClassApp) {
        this.allClassApp = allClassApp;
    }

    /**
     * Returns the parameter set selected as final (best) model.
     *
     * @return the current reference, possibly {@code null}
     */
    public Parameter getFinalParams() {
        return this.finalParams;
    }

    /**
     * Replaces the parameter set held in {@code finalParams}.
     *
     * @param finalParams the parameter set to store
     */
    public void setFinalParams(Parameter finalParams) {
        this.finalParams = finalParams;
    }

    /**
     * Reports the flag passed to {@code Parameter.generateParamList} for a fresh cross-validation.
     *
     * @return the value of {@code roughValidation}
     */
    public boolean isRoughValidation() {
        return this.roughValidation;
    }

    /**
     * Sets the flag passed to {@code Parameter.generateParamList} for a fresh cross-validation.
     *
     * @param roughValidation the new flag value
     */
    public void setRoughValidation(boolean roughValidation) {
        this.roughValidation = roughValidation;
    }

    /**
     * Reports whether {@code executeXVal()} will resume from previously saved results.
     *
     * @return the value of {@code resumeXValidation}
     */
    public boolean isResumeXValidation() {
        return this.resumeXValidation;
    }

    /**
     * Sets whether {@code executeXVal()} resumes from previously saved results.
     *
     * @param resumeXValidation the new flag value
     */
    public void setResumeXValidation(boolean resumeXValidation) {
        this.resumeXValidation = resumeXValidation;
    }

    /**
     * Returns the value of {@code bestResultF1} (0.0 until it is set).
     *
     * @return the current value
     */
    public double getBestResultF1() {
        return this.bestResultF1;
    }

    /**
     * Sets the value of {@code bestResultF1}.
     *
     * @param bestResultF1 the value to store
     */
    public void setBestResultF1(double bestResultF1) {
        this.bestResultF1 = bestResultF1;
    }

    /**
     * Returns the latch created by {@code executeXVal()}.
     *
     * @return the current latch, possibly {@code null}
     */
    public CountDownLatch getCdl() {
        return this.cdl;
    }

    /**
     * Replaces the latch held in {@code cdl}.
     *
     * @param cdl the latch to store
     */
    public void setCdl(CountDownLatch cdl) {
        this.cdl = cdl;
    }

    /**
     * Returns the application wrapper created by {@code loadApps()}.
     *
     * @return the current wrapper, possibly {@code null}
     */
    public TextHunterMLCCWrapper getApp() {
        return this.app;
    }

    /**
     * Replaces the application wrapper.
     *
     * @param app the wrapper to store
     */
    public void setApp(TextHunterMLCCWrapper app) {
        this.app = app;
    }

    /**
     * Reports whether models are scored in "APPLICATION_ALL_CLASSES" mode rather than
     * "APPLICATION_POS_ONLY" mode during {@code outerValidation()}.
     *
     * @return the value of {@code multiClassMode}
     */
    public boolean isMultiClassMode() {
        return this.multiClassMode;
    }

    /**
     * Sets the multi-class flag read by {@code outerValidation()}.
     *
     * @param multiClassMode the new flag value
     */
    public void setMultiClassMode(boolean multiClassMode) {
        this.multiClassMode = multiClassMode;
    }

    /**
     * Returns the shared {@code MLModelMaker} instance.
     *
     * @return the instance currently stored in the static {@code instance} field
     */
    public static MLModelMaker getInstance() {
        return MLModelMaker.instance;
    }

    /**
     * Replaces the shared instance returned by {@code getInstance()}.
     *
     * @param instance the instance to store
     */
    public static void setInstance(MLModelMaker instance) {
        MLModelMaker.instance = instance;
    }

    /**
     * Loads the GATE application stored in {@code allClassGapp} and wraps it in a new
     * {@code TextHunterMLCCWrapper}. Failures are logged at SEVERE level and not rethrown.
     */
    public void loadApps() {
        final Logger log = Logger.getLogger(MLModelMaker.class.getName());
        try {
            final Object restored = PersistenceManager.loadObjectFromFile(this.allClassGapp);
            this.allClassApp = (CorpusController) restored;
            this.app = new TextHunterMLCCWrapper(this.allClassApp);
        } catch (IOException | ResourceInstantiationException | PersistenceException failure) {
            log.log(Level.SEVERE, null, failure);
        }
    }

    /**
     * Sets whether {@code executeXVal()} resumes from previously saved results.
     *
     * @param choice {@code true} to resume, {@code false} to start afresh
     */
    public void setResume(boolean choice) {
        this.resumeXValidation = choice;
    }

    /**
     * Trains the model on {@code trainingDocs} using the best parameter set reported by
     * {@code projectXML}.
     *
     * <p>The stored model files are reset, the application is configured for the chosen
     * parameters and saved to {@code allClassGapp} in "APPLICATION_ALL_CLASSES" mode, and it is
     * then executed over the training corpus in "TRAINING" mode. Checked failures are logged at
     * SEVERE level. Whatever happens, the application's GATE resources are deleted (when an
     * application is present) and the busy flag is cleared.
     */
    public void executeTrainAll() {
        setBusy(true);
        try {
            //Train only
            resetModel();
            System.out.println();
            System.out.println();
            finalParams = projectXML.getBestParameter();
            System.out.println("Training best model on " + trainingDocs.size() + " documents on model ID "
                    + finalParams.getID());
            //save best app before training
            app.changeFeatureSelection(finalParams);
            finalParams.xmlConfigGenerator(MLConfigFile);
            app.reinitialisePRs();
            app.getApp().setCorpus(null);
            app.setTrainingMode("APPLICATION_ALL_CLASSES");
            PersistenceManager.saveObjectToFile(app.getApp(), allClassGapp, true, false);
            //execute training
            app.setTrainingMode("TRAINING");
            app.getApp().setCorpus(trainingDocs);
            app.getApp().execute();
        } catch (IOException | ExecutionException | PersistenceException | ResourceInstantiationException ex) {
            Logger.getLogger(MLModelMaker.class.getName()).log(Level.SEVERE, null, ex);
        } finally {
            try {
                // Without this guard a missing application raises a second exception here, which
                // hides the original failure and skips the busy-flag reset below.
                if (app != null && app.getApp() != null) {
                    // Work on a snapshot of the PR collection while the resources are deleted.
                    ArrayList<ProcessingResource> trPRsList = new ArrayList<>(app.getApp().getPRs());
                    for (ProcessingResource pr : trPRsList) {
                        Factory.deleteResource(pr);
                    }
                    Factory.deleteResource(app.getApp());
                }
            } finally {
                setBusy(false);
            }
        }
    }

    /**
     * Prepares and launches the cross-validation worker threads.
     *
     * <p>Steps, in order:
     * <ol>
     * <li>Build {@code paramsList}. When resuming, fold count and list type come from the saved
     * "crossValidationResults.tsv" and previously recorded scores are copied onto the matching
     * parameters.</li>
     * <li>Unless there are fewer training documents than folds: on a fresh run, split
     * {@code trainingDocs} via {@code setUpCorpora} and save both parts under the results
     * directory ("outerCorpus" and "xvalCorpus"); when resuming, reload them from there.</li>
     * <li>Return early if every parameter already has an F1 value.</li>
     * <li>Write the results header (fresh run only), create one {@code XValidate} per thread
     * with its slice of parameter indices, and start the threads five seconds apart.</li>
     * </ol>
     *
     * <p>{@code cdl} starts as a latch of 1 and is replaced by a latch of
     * {@code threads + 1} once the threads have been started; this method counts it down once
     * on every exit path. An interruption of the calling thread abandons the remaining set-up.
     * I/O and resource-instantiation failures are logged at SEVERE level.
     *
     * <p>The busy flag is set on entry and is not cleared by this method.
     *
     * @return always {@code null}
     */
    public FeatureMap executeXVal() {
        setBusy(true);
        //before threads are started set countdown to one for cleanup routine
        cdl = new CountDownLatch(1);
        //load docs
        ArrayList<Corpus> corpusSetup;
        try {
            if (Thread.currentThread().isInterrupted()) {
                throw new InterruptedException();
            }

            try {
                File file = new File(resultsFile);
                File xvalResultsFile = new File(
                        file.getAbsoluteFile() + File.separator + "crossValidationResults.tsv");
                if (resumeXValidation) {
                    // NOTE(review): an empty results file makes get(0) throw - confirm callers
                    // only resume when results exist.
                    ArrayList<Parameter> prevResults = parseCmdLineParamFile(xvalResultsFile);
                    this.folds = prevResults.get(0).getFolds();
                    if (prevResults.get(0).getType().equals("clean")) {
                        paramsList = Parameter.generateParamList(false, folds);
                    } else {
                        paramsList = Parameter.generateParamList(true, folds);
                    }
                    for (Parameter toDoParam : paramsList) {
                        for (Parameter savedParam : prevResults) {
                            // equals-based so IDs match by value whether getID() yields a
                            // primitive or an object
                            if (Objects.equals(toDoParam.getID(), savedParam.getID())) {
                                toDoParam.setF1(savedParam.getF1());
                                toDoParam.setPrecision(savedParam.getPrecision());
                                toDoParam.setRecall(savedParam.getRecall());
                            }
                        }
                    }
                } else {
                    paramsList = Parameter.generateParamList(roughValidation, folds);
                }

                if (Thread.currentThread().isInterrupted()) {
                    throw new InterruptedException();
                }
                if (folds > trainingDocs.size()) {
                    System.out.println("Insufficient documents available (" + trainingDocs.size() + ") for " + folds
                            + " folds.");
                } else {
                    File xvalCorpusDir = new File(resultsFile + File.separator + "xvalCorpus");
                    File finCorpusDir = new File(resultsFile + File.separator + "outerCorpus");

                    File tempAppFolder = new File(resultsFile + File.separator + "temp");
                    if (!tempAppFolder.exists()) {
                        tempAppFolder.mkdir();
                    }
                    if (!resumeXValidation) {
                        // up to six attempts, half a second apart, to clear the old corpus folders
                        for (int i = 0; i <= 5; i++) {
                            try {
                                Thread.sleep(500);
                                FileUtils.deleteDirectory(xvalCorpusDir);
                                FileUtils.deleteDirectory(finCorpusDir);
                                break;
                            } catch (IOException ex) {
                                System.out.println("Attempting deletion " + i);
                            }
                        }
                        corpusSetup = setUpCorpora(trainingDocs, true);
                        this.fincorpus = corpusSetup.get(0);
                        this.opcorpus = corpusSetup.get(1);
                        // if file doesnt exists, then create it
                        if (file.mkdir()) {
                            System.out.println("Directory Created");
                        } else {
                            System.out.println("Directory is not created");
                        }
                        //save corpora in case of interruption
                        if (finCorpusDir.mkdir()) {
                            System.out.println("outerCorpus dir Created");
                        } else {
                            System.out.println("outerCorpus dir  is not created");
                        }

                        if (Thread.currentThread().isInterrupted()) {
                            throw new InterruptedException();
                        }
                        for (gate.Document doc : fincorpus) {
                            try {
                                gate.corpora.DocumentStaxUtils.writeDocument(doc,
                                        new File(finCorpusDir.getAbsolutePath() + File.separator + doc.getName()));
                            } catch (XMLStreamException | IOException ex) {
                                Logger.getLogger(MLModelMaker.class.getName()).log(Level.SEVERE, null, ex);
                            }
                        }

                        if (xvalCorpusDir.mkdir()) {
                            System.out.println("xvalCorpus dir Created");
                        } else {
                            System.out.println("xvalCorpus dir  is not created");
                        }
                        for (gate.Document doc : opcorpus) {
                            try {
                                gate.corpora.DocumentStaxUtils.writeDocument(doc,
                                        new File(xvalCorpusDir.getAbsolutePath() + File.separator + doc.getName()));
                            } catch (XMLStreamException | IOException ex) {
                                Logger.getLogger(MLModelMaker.class.getName()).log(Level.SEVERE, null, ex);
                            }
                        }

                    } else {
                        fincorpus = Factory.newCorpus("OuterCorpus");
                        opcorpus = Factory.newCorpus("OptimisationCorpus");
                        this.fincorpus.populate((finCorpusDir.toURI().toURL()), null, "UTF-8", false);
                        this.opcorpus.populate(xvalCorpusDir.toURI().toURL(), null, "UTF-8", false);
                    }
                    if (getProgressThroughParamList() == paramsList.size()) {
                        // every parameter already has a score; the finally block releases the latch
                        return null;
                    }

                    //write header (the file is opened in append mode when resuming)
                    try (BufferedWriter bw = new BufferedWriter(
                            new FileWriter(xvalResultsFile, resumeXValidation))) {
                        String content = "type\tID\tc\tt\td\ttau\tnegex\trmStops\tmissing\tspurious\tcorrect\tP\tR\tF1\tfolds";
                        if (!resumeXValidation) {
                            bw.write(content);
                            bw.newLine();
                        }
                    }
                    // at least one worker, so the division and array size below stay valid
                    int processorCount = Math.max(1, threadCount);
                    int paramsPerCore = paramsList.size() / processorCount;
                    int[][] paramIDAL = new int[processorCount][2];

                    // NOTE(review): how XValidate interprets each [start, end] pair (inclusive
                    // or exclusive end) is not visible here - confirm the slices cover every index.
                    int runningCount = 0;
                    for (int i = 0; i <= processorCount - 1; i++) {

                        if (i == 0) {
                            paramIDAL[i][0] = 0;
                            paramIDAL[i][1] = paramsPerCore;
                        } else if (i == processorCount - 1) {
                            paramIDAL[i][0] = runningCount;
                            paramIDAL[i][1] = paramsList.size();
                        } else {
                            paramIDAL[i][0] = runningCount;
                            paramIDAL[i][1] = runningCount + paramsPerCore;
                        }
                        runningCount = runningCount + paramsPerCore + 1;
                    }
                    // the constructor does not create this list, so create it on first use
                    if (alXValidate == null) {
                        alXValidate = new CopyOnWriteArrayList<>();
                    }
                    for (int i = 0; i <= processorCount - 1; i++) {
                        XValidate xval = new XValidate(opcorpus, paramIDAL[i]);
                        synchronized (loadLock) {
                            xval.loadThreadApps();
                        }
                        alXValidate.add(xval);
                    }
                    System.out.println("X val ready to go with " + alXValidate.size() + " threads!");
                    this.xvalARit = alXValidate.iterator();
                    if (Thread.currentThread().isInterrupted()) {
                        throw new InterruptedException();
                    }
                    boolean interruptedWhileLaunching = false;
                    for (int i = 0; i <= processorCount - 1; i++) {
                        newThread nt = new newThread(xvalARit.next());
                        Thread t = new Thread(nt);
                        t.start();
                        try {
                            Thread.sleep(5000);
                        } catch (InterruptedException ex) {
                            // keep launching the remaining workers, but remember the request so
                            // the interruption check below still sees it
                            interruptedWhileLaunching = true;
                        }
                    }
                    //once threads are started, set CDL to number of threads plus one for this method
                    cdl = new CountDownLatch(processorCount + 1);
                    if (interruptedWhileLaunching) {
                        Thread.currentThread().interrupt();
                    }
                    if (Thread.currentThread().isInterrupted()) {
                        throw new InterruptedException();
                    }

                }
            } catch (IOException | ResourceInstantiationException ex) {
                Logger.getLogger(MLModelMaker.class.getName()).log(Level.SEVERE, null, ex);
            }
        } catch (InterruptedException ex) {
            // interruption is treated as a request to abandon the remaining set-up
        } finally {
            //count down by one as MLmodelmaker has run its main routine
            cdl.countDown();
        }
        return null;
    }

    /**
     * Counts the entries of {@code paramsList} that already carry an F1 value.
     *
     * @return the number of parameters whose {@code getF1()} is not {@code null}
     */
    public Integer getProgressThroughParamList() {
        int scored = 0;
        for (Parameter candidate : paramsList) {
            if (candidate.getF1() == null) {
                continue;
            }
            scored++;
        }
        return scored;
    }

    /**
     * Re-trains the top-ranked parameter sets on {@code opcorpus}, scores each against
     * {@code fincorpus} and records the outcome.
     *
     * <p>The parameters are sorted with {@code F1ParameterComparator} and processed in that
     * order until {@code GlobalParameters.MODELSTOCARRYFORWARD} models have been handled. For
     * each one the model files are reset, the application is trained, applied (all classes or
     * positive only, depending on {@code multiClassMode}), evaluated, and a row is appended to
     * "nestedValidationResults.tsv". The application is then run once more in
     * "APPLICATION_ALL_CLASSES" mode to collect a signed probability per document (negated when
     * the "observation" feature is not "positive"); these are written by
     * {@code writeBestModelProbs}.
     *
     * <p>{@code finalParams} is updated whenever a model's F1 is greater than or equal to the
     * best seen so far. Checked failures are logged at SEVERE level; the busy flag is always
     * cleared before returning.
     *
     * @return the value of {@code finalParams} after the run
     */
    public Parameter outerValidation() {
        setBusy(true);
        try {
            synchronized (loadLock) {
                loadApps();
            }
            ArrayList<Parameter> newAR = new ArrayList<>();
            newAR.addAll(paramsList);
            File file = new File(resultsFile);
            Collections.sort(newAR, new F1ParameterComparator());
            //reset bestResult to hold comparisons of best models
            double bestResultf1 = 0.0;
            ArrayList<Prob> probValues = new ArrayList<>();
            TreeMap<String, String> observations = new TreeMap<>();
            // try-with-resources owns the writer, so nothing is closed when opening it failed
            try (BufferedWriter bw2 = new BufferedWriter(new FileWriter(
                    file.getAbsoluteFile() + File.separator + "nestedValidationResults.tsv"))) {
                System.out.println("testing best models against unseen corpus");
                String content = "type\tID\tc\tt\td\ttau\tnegex\trmStops\tmissing\tspurious\tcorrect\tP\tR\tF1\tfolds";
                bw2.write(content);
                bw2.newLine();
                int i = 1;
                // document name -> value of the ML feature on the key annotation
                for (gate.Document doc : fincorpus) {
                    String keyAnnotID = doc.getName();
                    String ob = doc.getAnnotations(GlobalParameters.KEYANNOTSETNAME)
                            .get(GlobalParameters.KEYANNOTSETTYPE).get(0).getFeatures()
                            .get(GlobalParameters.MLFEATURENAME).toString();
                    observations.put(keyAnnotID, ob);
                }
                for (Parameter resultParam : newAR) {
                    resultParam.setPrecision(0.0);
                    resultParam.setRecall(0.0);
                    resultParam.setF1(0.0);
                    currentParams = resultParam;
                    resetModel();
                    resultParam.xmlConfigGenerator(MLConfigFile);
                    app.changeFeatureSelection(resultParam);
                    synchronized (loadLock) {
                        app.reinitialisePRs();
                    }
                    app.setTrainingMode("TRAINING");
                    app.getApp().setCorpus(opcorpus);
                    app.getApp().execute();

                    synchronized (loadLock) {
                        app.reinitialisePRs();
                    }
                    if (multiClassMode) {
                        app.setTrainingMode("APPLICATION_ALL_CLASSES");
                    } else {
                        app.setTrainingMode("APPLICATION_POS_ONLY");
                    }
                    app.getApp().setCorpus(fincorpus);
                    app.getApp().execute();
                    // evaluate() yields {missing, spurious, correct}, as the output below labels them
                    int[] nestedResults = evaluate(fincorpus, true);
                    resultParam.calculateResults(nestedResults[0], nestedResults[1], nestedResults[2]);

                    //calculate if best model when testing more than one
                    if (resultParam.getF1() == bestResultf1) {
                        finalParams = resultParam;
                        System.out.println("Model is equal to best.");

                    } else if (resultParam.getF1() > bestResultf1 && bestResultf1 == 0.0) {
                        finalParams = resultParam;
                        System.out.println("first model tested");
                        bestResultf1 = resultParam.getF1();
                    } else if (resultParam.getF1() > bestResultf1) {
                        finalParams = resultParam;
                        System.out.println("Model is better!");
                        bestResultf1 = resultParam.getF1();
                    }

                    bw2.write(resultParam.getType() + "\t" + resultParam.getID() + "\t" + resultParam.getC() + "\t"
                            + resultParam.getT() + "\t" + resultParam.getD() + "\t" + resultParam.getTau() + "\t"
                            + resultParam.isRUN_NEGEX() + "\t" + resultParam.isRemoveStopWords() + "\t"
                            + nestedResults[0] + "\t" + nestedResults[1] + "\t" + nestedResults[2] + "\t"
                            + resultParam.getPrecision() + "\t" + resultParam.getRecall() + "\t"
                            + resultParam.getF1() + "\t" + resultParam.getFolds());
                    bw2.newLine();
                    bw2.flush();
                    System.out.println("Testing on unseen data done:");
                    System.out.println("Missing = " + nestedResults[0] + ", Spurious = " + nestedResults[1]
                            + ", Correct = " + nestedResults[2]);
                    System.out.println("Precision = " + resultParam.getPrecision() + ", Recall = "
                            + resultParam.getRecall() + ", F1 = " + resultParam.getF1());
                    //capture probabilities
                    synchronized (loadLock) {
                        app.reinitialisePRs();
                    }

                    app.setTrainingMode("APPLICATION_ALL_CLASSES");
                    app.getApp().setCorpus(fincorpus);
                    app.getApp().execute();
                    Prob result = new Prob();
                    result.setModelID(resultParam.getID());
                    for (gate.Document doc : fincorpus) {
                        String keyAnnotID = doc.getName();
                        FeatureMap mlFeatures = gate.Utils.getOnlyAnn(doc.getAnnotations("ML")).getFeatures();
                        boolean positive = mlFeatures.get("observation").toString().equalsIgnoreCase("positive");
                        float prob = Float.parseFloat(mlFeatures.get("prob").toString());
                        if (!positive) {
                            // non-positive observations are stored with a negative sign
                            prob = prob * -1;
                        }
                        result.getMap().put(keyAnnotID, prob);
                    }
                    probValues.add(result);
                    //break after top x models
                    if (i == GlobalParameters.MODELSTOCARRYFORWARD) {
                        break;
                    }
                    i++;
                }
            }
            writeBestModelProbs(probValues, file, observations);
            //finally, clear the stored model files and detach the corpus from the application
            resetModel();
            app.getApp().setCorpus(null);
        } catch (IOException | ExecutionException | ResourceInstantiationException ex) {
            Logger.getLogger(MLModelMaker.class.getName()).log(Level.SEVERE, null, ex);
        } finally {
            // also reached on unchecked failures, so the instance never stays busy
            setBusy(false);
        }
        return finalParams;
    }

    /**
     * Writes "bestModelProbabilities.tsv" into the given directory: a header row with one
     * column per model ID, then one row per observation holding each model's value for that
     * key. {@code probValues} is sorted in place (natural ordering of {@code Prob}) first.
     *
     * @param probValues   per-model probability maps; sorted as a side effect
     * @param file         directory the file is written into
     * @param observations key/observation pairs, one output row each
     * @throws IOException if the file cannot be created or written
     */
    public void writeBestModelProbs(ArrayList<Prob> probValues, File file, TreeMap<String, String> observations)
            throws IOException {
        try (BufferedWriter out = new BufferedWriter(
                new FileWriter(file.getAbsoluteFile() + File.separator + "bestModelProbabilities.tsv"))) {
            System.out.println("outputting probabilites");
            Collections.sort(probValues);

            StringBuilder header = new StringBuilder("annotID\tobservation\t");
            for (Prob model : probValues) {
                header.append(model.getModelID() + "\t");
            }
            out.write(header.toString());
            out.newLine();

            for (Map.Entry<String, String> row : observations.entrySet()) {
                StringBuilder line = new StringBuilder();
                line.append(String.valueOf(row.getKey())).append("\t");
                line.append(row.getValue()).append("\t");
                for (Prob model : probValues) {
                    line.append(String.valueOf(model.getMap().get(row.getKey()))).append("\t");
                }
                out.write(line.toString());
                out.newLine();
            }
        }
    }

    /**
     * Stops the cross-validation workers and releases GATE resources.
     *
     * <p>The calling thread's interrupt flag is set, then every worker thread in
     * {@code alXValidate} is interrupted. If the calling thread's flag is (still) set it is
     * cleared and the method waits up to two minutes on {@code cdl}, when a latch exists.
     * Finally {@code Utils.deleteGateResources(loadLock)} is called. When {@code alXValidate}
     * is {@code null} only the interrupt flag is set. The busy flag is set for the duration
     * and always cleared on exit.
     *
     * NOTE(review): when {@code alXValidate} is {@code null} the caller's interrupt flag is
     * left set on return - confirm that is intended.
     */
    public void cleanUp() {
        setBusy(true);
        try {
            Thread.currentThread().interrupt();
            if (this.alXValidate != null) {
                for (XValidate xval : this.alXValidate) {
                    if (xval.thread != null) {
                        try {
                            xval.thread.interrupt();
                        } catch (Exception ex) {
                            System.out.println(ex);
                        }
                    }
                }
                if (Thread.interrupted()) {
                    // explicit null check instead of catching NullPointerException
                    CountDownLatch latch = cdl;
                    if (latch != null) {
                        try {
                            latch.await(2, TimeUnit.MINUTES);
                        } catch (InterruptedException ex) {
                            Logger.getLogger(MLModelMaker.class.getName()).log(Level.SEVERE, null, ex);
                        }
                    }
                }
                Utils.deleteGateResources(loadLock);
            }
        } finally {
            setBusy(false);
        }
    }

    /**
     * Makes {@code parameters} the active configuration: hands the path held in
     * {@code MLConfigFile} to {@code parameters.xmlConfigGenerator} and then
     * passes the parameters to the wrapped application's
     * {@code changeFeatureSelection}.
     *
     * @param parameters the parameter set to apply
     */
    public void setDefaultParameters(Parameter parameters) {
        final String configPath = this.MLConfigFile;
        parameters.xmlConfigGenerator(configPath);
        this.app.changeFeatureSelection(parameters);
    }

    /**
     * Deletes the learned model and cached feature files between runs.
     * <p>
     * The files live in the {@code savedFiles} directory next to the ML
     * configuration file. Removal of {@code learnedModels.save} is attempted six
     * times, each preceded by a 500 ms pause; failures are reported on standard
     * output and do not abort the method. The remaining four files are then
     * deleted on a best-effort basis.
     * <p>
     * If the thread is interrupted during one of the pauses, that attempt is
     * skipped (as before) and the interrupt flag is restored on exit so the
     * caller can still observe the cancellation request.
     *
     * @throws IOException declared for callers; I/O failures during deletion are
     *                     handled internally
     */
    private void resetModel() throws IOException {
        File mlModelDirectory = new File(MLConfigFile).getParentFile();
        File savedFiles = new File(mlModelDirectory.getAbsolutePath(), "savedFiles");
        File learnedModels = new File(savedFiles, "learnedModels.save");
        File featureVectorsData = new File(savedFiles, "featureVectorsData.save");
        File labelsList = new File(savedFiles, "LabelsList.save");
        File nlpFeatureData = new File(savedFiles, "NLPFeatureData.save");
        File nlpFeatureList = new File(savedFiles, "NLPFeatureList.save");

        // NOTE(review): the loop keeps going after a successful delete. The number of attempts
        // and the pauses are left untouched in case later steps rely on the delay - confirm
        // before adding an early exit.
        boolean interrupted = false;
        for (int attempt = 0; attempt <= 5; attempt++) {
            try {
                Thread.sleep(500);
                FileUtils.deleteDirectory(learnedModels);
            } catch (IOException ex) {
                System.out.println("Attempting to Cancel");
            } catch (InterruptedException ex) {
                // Thread.sleep cleared the flag; remember it so it can be re-asserted below.
                interrupted = true;
                System.out.println("Attempting to Cancel");
            }
        }
        featureVectorsData.delete();
        labelsList.delete();
        nlpFeatureData.delete();
        nlpFeatureList.delete();

        if (interrupted) {
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Chooses the outer (final test) corpus and the optimisation corpus and
     * stores them in {@code fincorpus} and {@code opcorpus}.
     * <p>
     * With {@code goldStandard} set, {@code goldStandardCorpus} becomes the outer
     * corpus and {@code corpus} is used whole for optimisation. Otherwise two new
     * corpora are created and the documents of {@code corpus} are dealt out in
     * order: the first document of every run of {@code outerCorpSizeRatio}
     * documents goes to the outer corpus, the rest to the optimisation corpus.
     * <p>
     * The positive-observation count of both corpora is printed before returning.
     *
     * @param corpus       the annotated documents to split or optimise on
     * @param goldStandard whether a separate gold-standard corpus supplies the outer set
     * @return a list holding the outer corpus then the optimisation corpus; empty
     *         if the new corpora could not be created
     */
    public ArrayList<Corpus> setUpCorpora(Corpus corpus, boolean goldStandard) {
        ArrayList<Corpus> corpora = new ArrayList<>();
        if (!goldStandard) {
            try {
                fincorpus = Factory.newCorpus("OuterCorpus");
                opcorpus = Factory.newCorpus("OptimisationCorpus");
                int slot = 0;
                int total = 0;
                while (total < corpus.size()) {
                    gate.Document current = corpus.get(total);
                    Corpus destination = (slot == 0) ? fincorpus : opcorpus;
                    destination.add(current);
                    slot++;
                    if (slot >= outerCorpSizeRatio) {
                        slot = 0;
                    }
                    total++;
                }
                System.out.println("setup module outer corpus size = " + fincorpus.size());
                System.out.println("setup module optimization corpus size = " + opcorpus.size());
                corpora.add(fincorpus);
                corpora.add(opcorpus);
            } catch (ResourceInstantiationException ex) {
                System.out.println(ex);
            }
        } else {
            fincorpus = goldStandardCorpus;
            opcorpus = corpus;
            corpora.add(fincorpus);
            corpora.add(opcorpus);
        }
        System.out.println("positive observations in training set " + getPosCount(opcorpus));
        System.out.println("positive observations in testing set " + getPosCount(fincorpus));
        return corpora;
    }

    /**
     * Counts the {@code ManualAnnotation} annotations in the {@code Key} set of
     * every document whose {@code observation} feature equals {@code "positive"}
     * (case-insensitive).
     * <p>
     * Annotations without an {@code observation} feature are treated as
     * non-positive instead of causing a {@link NullPointerException}.
     *
     * @param corpus the documents to scan
     * @return the number of positive manual annotations
     */
    private int getPosCount(Corpus corpus) {
        int posCount = 0;
        for (gate.Document doc : corpus) {
            for (gate.Annotation annot : doc.getAnnotations("Key").get("ManualAnnotation")) {
                Object observation = annot.getFeatures().get("observation");
                if (observation != null && "positive".equalsIgnoreCase(observation.toString())) {
                    posCount++;
                }
            }
        }
        return posCount;
    }

    /**
     * Diffs the key annotations (set {@code annSetA}) against the response
     * annotations (set {@code annSetB}) of type {@code type} in every document,
     * treating {@code feature} as the only significant feature, and totals the
     * outcome over the corpus.
     *
     * @param corpus        documents carrying both annotation sets
     * @param errorAnalysis when {@code true}, the spurious and missing annotations of
     *                      each document are also appended to the error report files
     * @return a three-element array: missing, spurious and correct counts, in that order
     */
    private int[] evaluate(Corpus corpus, boolean errorAnalysis) {
        final Set<String> significantFeatures = Collections.singleton(feature);
        int missingTotal = 0;
        int spuriousTotal = 0;
        int correctTotal = 0;

        for (gate.Document doc : corpus) {
            AnnotationSet key = doc.getAnnotations(annSetA).get(type);
            AnnotationSet response = doc.getAnnotations(annSetB).get(type);

            AnnotationDiffer differ = new AnnotationDiffer();
            differ.setSignificantFeaturesSet(significantFeatures);
            differ.calculateDiff(key, response);

            Set<Annotation> missing = differ.missingAnnotations;
            Set<Annotation> spurious = differ.spuriousAnnotations;
            Set<Annotation> correct = differ.correctAnnotations;
            missingTotal += missing.size();
            spuriousTotal += spurious.size();
            correctTotal += correct.size();

            if (errorAnalysis) {
                outputErroneousClassifications(spurious, doc, "spurious");
                outputErroneousClassifications(missing, doc, "missing");
            }
        }

        System.out.println(
                "Fold missing: " + missingTotal + ", spurious: " + spuriousTotal + ", correct: " + correctTotal + ".");

        return new int[] {missingTotal, spuriousTotal, correctTotal};
    }

    /**
     * Appends one row per annotation to {@code <type>_annotations.tsv} in the
     * results directory, writing the column header first when the file is still
     * empty. Each row carries the current parameter settings, the annotation text
     * as returned by {@code gate.Utils.cleanStringFor} and the document name.
     * I/O failures are printed to standard output and otherwise ignored.
     *
     * @param annotationSet the misclassified annotations to record
     * @param doc           the document the annotations belong to
     * @param type          error category used as the file name prefix
     */
    private void outputErroneousClassifications(Set<Annotation> annotationSet, gate.Document doc, String type) {
        File report = new File(resultsFile + File.separator + type + "_annotations.tsv");
        try (FileWriter appender = new FileWriter(report, true);
                BufferedWriter out = new BufferedWriter(appender)) {
            // A zero-length file has no header row yet.
            if (report.length() == 0) {
                out.write("id\tc\tt\td\ttau\tnegex\trmStops\tstring\tdocName");
                out.newLine();
            }
            for (Annotation annot : annotationSet) {
                String text = gate.Utils.cleanStringFor(doc, annot);
                String row = currentParams.getID() + "\t" + currentParams.getC() + "\t" + currentParams.getT()
                        + "\t" + currentParams.getD() + "\t" + currentParams.getTau() + "\t"
                        + currentParams.isRUN_NEGEX() + "\t" + currentParams.isRemoveStopWords() + "\t"
                        + text + "\t" + doc.getName();
                out.write(row);
                out.newLine();
                // Flush after every row.
                out.flush();
            }
        } catch (IOException ex) {
            System.out.println(ex);
        }
    }

    /**
     * Reads a tab-separated cross-validation results file back into
     * {@link Parameter} objects.
     * <p>
     * The first line is skipped as a header. Every further line is echoed to
     * standard output and parsed by column index: 0 type, 1 ID, 2 C, 3 T, 4 D,
     * 5 tau, 6 negex flag, 7 stop-word flag, 11-13 as doubles and 14 as an int.
     * Columns 8-10 are not read.
     * <p>
     * Parsing stops at the first unreadable or malformed line; the parameters
     * read up to that point are returned. A missing file yields an empty list.
     * The reader is always closed.
     *
     * @param resultsFile the results file to read
     * @return the parameters parsed before the end of file or the first error; never {@code null}
     */
    private static ArrayList<Parameter> parseCmdLineParamFile(File resultsFile) {
        ArrayList<Parameter> parsed = new ArrayList<>();
        try (BufferedReader br = new BufferedReader(new FileReader(resultsFile))) {
            // skip header
            br.readLine();
            String line;
            while ((line = br.readLine()) != null) {
                System.out.println(line);
                String[] cols = line.split("\t");
                Parameter parameters = new Parameter(cols[0], Integer.parseInt(cols[1]),
                        Double.parseDouble(cols[2]), Integer.parseInt(cols[3]),
                        Integer.parseInt(cols[4]), Double.parseDouble(cols[5]),
                        Boolean.parseBoolean(cols[6]), Boolean.parseBoolean(cols[7]),
                        Double.parseDouble(cols[11]), Double.parseDouble(cols[12]),
                        Double.parseDouble(cols[13]), Integer.parseInt(cols[14]));
                parsed.add(parameters);
            }
        } catch (IOException ex) {
            // Also covers FileNotFoundException: report it and return what was read so far.
            System.out.println(ex);
        } catch (NumberFormatException | ArrayIndexOutOfBoundsException ex) {
            // A field was not a valid number, or the line had too few columns.
            System.out.println("Bad Line");
            System.out.println(ex);
        }
        return parsed;
    }

    /**
     * Returns the current wall-clock time formatted as {@code HH:mm:ss}.
     *
     * @return the formatted time of day
     */
    private static String getCurrentTime() {
        SimpleDateFormat clockFormat = new SimpleDateFormat("HH:mm:ss");
        return clockFormat.format(Calendar.getInstance().getTime());
    }

    /**
     * Rebuilds the list of parameter sets still to be cross-validated from an
     * existing {@code crossValidationResults.tsv} in the results directory.
     * <p>
     * The last non-blank line of the file decides what happens: its first field
     * selects the rough list (when it equals {@code "rough"}, ignoring case) or
     * the other list from {@code Parameter.generateParamList}, and its second
     * field is the highest parameter ID already processed. Every parameter whose
     * ID is less than or equal to that value is removed from {@code paramsList}.
     * <p>
     * If the file cannot be read, or holds no line with at least two fields, the
     * problem is logged and {@code paramsList} is left unchanged.
     * <p>
     * NOTE(review): the busy flag is set here and never cleared in this method;
     * presumably the caller clears it once validation finishes - confirm.
     *
     * @throws NumberFormatException if the second field of the last line is not an integer
     */
    public void resumeXValidation() {
        setBusy(true);
        String resultsPath = resultsFile + File.separator + "crossValidationResults.tsv";
        try (BufferedReader br = new BufferedReader(new FileReader(resultsPath))) {
            String[] lastLine = null;
            String line;
            while ((line = br.readLine()) != null) {
                // Ignore blank lines so a stray trailing newline cannot hide the last result row.
                if (!line.trim().isEmpty()) {
                    lastLine = line.split("\t");
                }
            }

            if (lastLine == null || lastLine.length < 2) {
                Logger.getLogger(MLModelMaker.class.getName()).log(Level.WARNING,
                        "No usable result row in {0}; nothing to resume from", resultsPath);
                return;
            }

            // Parse once, before paramsList is replaced, so a malformed ID leaves the list untouched.
            int lastCompletedId = Integer.parseInt(lastLine[1]);
            boolean rough = lastLine[0].equalsIgnoreCase("rough");
            paramsList = Parameter.generateParamList(rough, folds);

            Iterator<Parameter> it = paramsList.iterator();
            while (it.hasNext()) {
                if (it.next().getID() <= lastCompletedId) {
                    it.remove();
                }
            }
        } catch (IOException ex) {
            // Also covers FileNotFoundException.
            Logger.getLogger(MLModelMaker.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    /**
     * Placeholder that has not been implemented.
     *
     * @throws UnsupportedOperationException always
     */
    private void checkBestParamsPopulated() {
        final String message = "Not supported yet.";
        throw new UnsupportedOperationException(message);
    }

    /**
     * Initialises the state needed before a run: a fresh worker list, the
     * locations of {@code all_classes.gapp} and {@code MLconfig.xml} under
     * {@code applicationLocation}, and an outer-corpus ratio of 5. Also prints
     * which classification mode is active.
     */
    public void prepareForBlastOff() {
        final String appRoot = applicationLocation + File.separator;
        this.alXValidate = new CopyOnWriteArrayList<>();
        this.allClassGapp = new File(appRoot + "all_classes.gapp");
        this.MLConfigFile = appRoot + "MLconfig.xml";
        this.outerCorpSizeRatio = 5;

        String modeBanner = this.multiClassMode
                ? "TextHunter is in MultiClass mode"
                : "TextHunter is in Positive Instance mode";
        System.out.println(modeBanner);
    }

    /**
     * {@link Runnable} adapter that runs one {@link XValidate} worker by calling
     * its {@code execute()} method.
     */
    public class newThread implements Runnable {

        /** The worker whose {@code execute()} method is invoked by {@link #run()}. */
        public XValidate xval;

        /**
         * @param xval the worker to run
         */
        newThread(XValidate xval) {
            this.xval = xval;
        }

        /** Delegates to the wrapped worker. */
        @Override
        public void run() {
            this.xval.execute();
        }
    }

    /**
     * One cross-validation worker.
     * <p>
     * Each worker owns a private copy of the application folder and of the
     * validation corpus (see {@link #loadThreadApps()}) and cross-validates the
     * parameter sets whose IDs fall in its assigned range (see
     * {@link #execute()}). Results are appended to the shared
     * {@code crossValidationResults.tsv} under {@code writeLock}.
     */
    public class XValidate {

        private File threadTrainingGapp = null;
        private File threadApplicationGapp = null;
        /** Path of this worker's private {@code MLconfig.xml}; set by {@link #loadThreadApps()}. */
        private String threadMLConfigFile;
        /** The shared corpus this worker's private corpus is duplicated from. */
        public Corpus masterXValCorpus = null;
        private String threadApplicationLocation;
        public Controller threadTrainingApp;
        public TextHunterMLCCWrapper threadApp;
        /** Inclusive parameter ID range to process: index 0 is the lower bound, index 1 the upper. */
        int[] IDsToProcess;
        Corpus threadXValCorpus;
        /** The parameter set currently being validated; used for progress output. */
        private Parameter currentParams;
        /** The thread running {@link #execute()}; {@code null} until execution starts. */
        public Thread thread;
        File tempDir;

        /**
         * @param xValCorpus the corpus to cross-validate on
         * @param paramIDAL  two-element array holding the first and last parameter ID
         *                   (both inclusive) this worker is responsible for
         */
        XValidate(Corpus xValCorpus, int[] paramIDAL) {
            this.threadMLConfigFile = null;
            this.masterXValCorpus = xValCorpus;
            this.IDsToProcess = paramIDAL;
        }

        /**
         * Appends one result row for {@code parameters} to
         * {@code crossValidationResults.tsv} and records its F1 in
         * {@code bestResultF1}. The whole operation runs under {@code writeLock}
         * and the file is closed before the lock is released.
         *
         * @param parameters the evaluated parameter set
         * @param results    missing, spurious and correct counts, in that order
         * @throws IOException if the results file cannot be opened or written
         */
        private void writeResults(Parameter parameters, int[] results) throws IOException {
            synchronized (writeLock) {
                bestResultF1 = parameters.getF1();
                String target = new File(resultsFile) + File.separator + "crossValidationResults.tsv";
                // try-with-resources: the writer used to be left open after every call.
                try (BufferedWriter bw = new BufferedWriter(new FileWriter(target, true))) {
                    bw.write(parameters.getType() + "\t" + parameters.getID() + "\t" + parameters.getC() + "\t"
                            + parameters.getT() + "\t" + parameters.getD() + "\t" + parameters.getTau() + "\t"
                            + parameters.isRUN_NEGEX() + "\t" + parameters.isRemoveStopWords() + "\t" + results[0]
                            + "\t" + results[1] + "\t" + results[2] + "\t" + parameters.getPrecision() + "\t"
                            + parameters.getRecall() + "\t" + parameters.getF1() + "\t" + parameters.getFolds());
                    bw.newLine();
                    bw.flush();
                }
            }
        }

        /**
         * Deletes this worker's learned model and cached feature files (in the
         * {@code savedFiles} directory next to its ML configuration file). An
         * {@link IOException} is printed and ends the clean-up early.
         */
        private void threadResetModel() {
            try {
                File mlModelDirectory = new File(threadMLConfigFile).getParentFile();
                System.out.println(mlModelDirectory.getCanonicalPath() + " thread ml config loc");
                File savedFiles = new File(mlModelDirectory.getAbsolutePath(), "savedFiles");
                File learnedModels = new File(savedFiles, "learnedModels.save");
                File featureVectorsData = new File(savedFiles, "featureVectorsData.save");
                File labelsList = new File(savedFiles, "LabelsList.save");
                File nlpFeatureData = new File(savedFiles, "NLPFeatureData.save");
                File nlpFeatureList = new File(savedFiles, "NLPFeatureList.save");
                FileUtils.deleteDirectory(learnedModels);
                featureVectorsData.delete();
                labelsList.delete();
                nlpFeatureData.delete();
                nlpFeatureList.delete();
            } catch (IOException ex) {
                System.out.println(ex);
            }
        }

        /**
         * Prepares this worker's private working set: copies the application
         * folder into a new temporary directory under {@code <results>/temp},
         * duplicates every document of the master corpus (including its
         * {@code Key} annotations) into a private corpus, and loads
         * {@code all_classes.gapp} from the copy. Document duplication and
         * application loading run under {@code loadLock}. Failures are logged.
         */
        public void loadThreadApps() {
            try {
                // make temp dir
                File tempAppFolder = new File(resultsFile + File.separator + "temp");
                File applicationFolder = new File(applicationLocation);
                Path path = Files.createTempDirectory(tempAppFolder.toPath(), null);
                tempDir = path.toFile();
                // copy files
                Utils.copyFolder(applicationFolder, tempDir);
                // set fields
                this.threadApplicationLocation = path.toString();
                this.threadTrainingGapp = new File(threadApplicationLocation + File.separator + "all_classes.gapp");
                this.threadMLConfigFile = threadApplicationLocation + File.separator + "MLconfig.xml";
                threadXValCorpus = Factory.newCorpus("Thread Corpus");
                synchronized (loadLock) {
                    for (gate.Document doc : masterXValCorpus) {
                        gate.Document newDoc = (gate.Document) Factory.duplicate(doc);
                        newDoc.getAnnotations("Key").addAll(doc.getAnnotations("Key"));
                        threadXValCorpus.add(newDoc);
                    }
                    this.threadTrainingApp = (CorpusController) PersistenceManager
                            .loadObjectFromFile(threadTrainingGapp);
                }

                threadApp = new TextHunterMLCCWrapper(threadTrainingApp);
            } catch (PersistenceException | IOException | ResourceInstantiationException ex) {
                Logger.getLogger(MLModelMaker.class.getName()).log(Level.SEVERE, null, ex);
            }
        }

        /**
         * Cross-validates every parameter set in {@code paramsList} whose ID lies
         * in this worker's range and that has no F1 yet, writing each result to
         * the results file.
         * <p>
         * Interruption stops the worker. The latch {@code cdl} is counted down
         * exactly once however the method ends, so threads waiting on it are
         * released even when an unexpected runtime exception escapes.
         */
        public void execute() {
            this.thread = Thread.currentThread();
            try {
                Iterator<Parameter> it = paramsList.iterator();
                while (it.hasNext()) {
                    Parameter parameters = it.next();
                    System.out.println(parameters.getID() + " vs " + IDsToProcess[0] + " " + IDsToProcess[1]);
                    boolean inRange = parameters.getID() >= IDsToProcess[0]
                            && parameters.getID() <= IDsToProcess[1];
                    if (inRange && parameters.getF1() == null) {
                        // for use elsewhere
                        currentParams = parameters;
                        // update SVM XML
                        System.out.println("Adjusting model parameters..." + "C__" + parameters.getC() + "T__"
                                + parameters.getT() + "D__" + parameters.getD() + "TAU__" + parameters.getTau()
                                + "__negex " + parameters.isRUN_NEGEX() + "__rmStops "
                                + parameters.isRemoveStopWords());
                        System.out.println();
                        parameters.xmlConfigGenerator(threadMLConfigFile);
                        threadApp.changeFeatureSelection(parameters);
                        System.out.println("Beginning cross validation...");
                        System.out.println();
                        System.out.println();
                        int[] results = crossValidate2();
                        if (Thread.interrupted()) {
                            throw new InterruptedException();
                        }
                        parameters.calculateResults(results[0], results[1], results[2]);
                        writeResults(parameters, results);
                        System.out.println("cross validation of Parameter Results done - C = " + parameters.getC()
                                + " T = " + parameters.getT() + " D = " + parameters.getD() + " tau = "
                                + parameters.getTau() + " negex = " + parameters.isRUN_NEGEX() + " rmStops = "
                                + parameters.isRemoveStopWords() + " Total Missing = " + results[0]
                                + " Total Spurious = " + results[1] + " Total Correct = " + results[2]
                                + " precision = " + parameters.getPrecision() + " recall = "
                                + parameters.getRecall() + " f1 = " + parameters.getF1());
                    }
                }
            } catch (IOException | ResourceInstantiationException | ExecutionException ex) {
                Logger.getLogger(MLModelMaker.class.getName()).log(Level.SEVERE, null, ex);
            } catch (InterruptedException ex) {
                System.out.println("Thread " + thread.getName() + " successfully stopped");
            } finally {
                // Always signal completion; otherwise a runtime failure here would leave
                // anything awaiting the latch blocked until its timeout.
                cdl.countDown();
            }
        }

        /**
         * Runs {@code folds}-fold cross-validation with the currently configured
         * parameters on this worker's corpus.
         * <p>
         * For fold {@code f}, document {@code i} goes to the test corpus when
         * {@code (f + i) % folds == 0} and to the training corpus otherwise. The
         * model files are deleted, the application is run in {@code TRAINING}
         * mode on the training corpus and then in application mode (all classes
         * or positives only, depending on {@code multiClassMode}) on the test
         * corpus, and the test corpus is evaluated. Both corpora are deleted at
         * the end of each fold.
         *
         * @return missing, spurious and correct counts summed over all folds
         * @throws InterruptedException if the thread's interrupt flag is set at the start of a fold
         */
        private int[] crossValidate2() throws ResourceInstantiationException, ExecutionException,
                gate.creole.ExecutionInterruptedException, InterruptedException {

            int[] allFoldsResults = new int[3];

            for (int fold = 0; fold < folds; fold++) {
                if (Thread.interrupted()) {
                    throw new InterruptedException();
                }
                Corpus threadtrcorpus = Factory.newCorpus("TrainingCorpus");
                Corpus threadapcorpus = Factory.newCorpus("ApplicationCorpus");
                int pos = fold;
                for (int thisdoc = 0; thisdoc < threadXValCorpus.size(); thisdoc++) {
                    gate.Document doc1 = threadXValCorpus.get(thisdoc);
                    if (pos == 0) {
                        threadapcorpus.add(doc1);
                    } else {
                        threadtrcorpus.add(doc1);
                    }
                    pos++;
                    if (pos >= folds) {
                        pos = 0;
                    }
                }
                System.out.println("positive observations in training set " + getPosCount(threadtrcorpus));
                System.out.println("positive observations in testing set " + getPosCount(threadapcorpus));

                // apply the apps to the corpora
                System.out.println("Fold " + fold + ": Applying training application to " + threadtrcorpus.size()
                        + " documents.");
                System.out.println();
                threadResetModel();
                synchronized (loadLock) {
                    threadApp.reinitialisePRs();
                }
                threadApp.setTrainingMode("TRAINING");
                threadApp.getApp().setCorpus(threadtrcorpus);
                // A 2 s sleep under loadLock was once used at this point to prevent crashes in
                // training mode; it is currently disabled.
                long startTimeMs = System.currentTimeMillis();
                threadApp.getApp().execute();
                long taskTimeMs = System.currentTimeMillis() - startTimeMs;
                System.out.println("ParamID " + currentParams.getID() + " on Thead " + thread.getName()
                        + " train Mode done in " + (taskTimeMs / 1000) + "s");

                System.out.println("Fold " + fold + ": Applying application application to " + threadapcorpus.size()
                        + " documents.");
                System.out.println();
                System.out.println();
                System.out.println();
                synchronized (loadLock) {
                    threadApp.reinitialisePRs();
                }
                if (multiClassMode) {
                    threadApp.setTrainingMode("APPLICATION_ALL_CLASSES");
                } else {
                    threadApp.setTrainingMode("APPLICATION_POS_ONLY");
                }

                threadApp.getApp().setCorpus(threadapcorpus);
                startTimeMs = System.currentTimeMillis();
                threadApp.getApp().execute();
                taskTimeMs = System.currentTimeMillis() - startTimeMs;
                System.out.println("ParamID " + currentParams.getID() + " on Thead " + thread.getName()
                        + " apply Mode done in " + (taskTimeMs / 1000) + "s");

                // Evaluate this fold and add its counts to the running totals.
                int[] currentFoldResult = threadEvaluate(threadapcorpus);

                // missing
                allFoldsResults[0] = allFoldsResults[0] + currentFoldResult[0];
                // spurious
                allFoldsResults[1] = allFoldsResults[1] + currentFoldResult[1];
                // correct
                allFoldsResults[2] = allFoldsResults[2] + currentFoldResult[2];
                Factory.deleteResource(threadapcorpus);
                Factory.deleteResource(threadtrcorpus);
            }
            return allFoldsResults;
        }

        /**
         * Diffs the key annotations (set {@code annSetA}) against the response
         * annotations (set {@code annSetB}) of type {@code type} in every
         * document, treating {@code feature} as the only significant feature.
         *
         * @param corpus documents carrying both annotation sets
         * @return missing, spurious and correct counts, in that order
         */
        private int[] threadEvaluate(Corpus corpus) {
            int[] results = new int[3];
            int foldmissing = 0;
            int foldspurious = 0;
            int foldcorrect = 0;

            for (gate.Document doc : corpus) {
                AnnotationSet manual = doc.getAnnotations(annSetA).get(type);
                AnnotationSet auto = doc.getAnnotations(annSetB).get(type);
                Set<String> importantFeatures = Collections.singleton(feature);
                AnnotationDiffer differ = new AnnotationDiffer();
                differ.setSignificantFeaturesSet(importantFeatures);
                differ.calculateDiff(manual, auto); // Key, Response
                Set<Annotation> missingSet = differ.missingAnnotations;
                Set<Annotation> spuriousSet = differ.spuriousAnnotations;
                Set<Annotation> correctSet = differ.correctAnnotations;
                foldmissing += missingSet.size();
                foldspurious += spuriousSet.size();
                foldcorrect += correctSet.size();
            }

            System.out.println("Fold missing: " + foldmissing + ", spurious: " + foldspurious + ", correct: "
                    + foldcorrect + ".");

            results[0] = foldmissing;
            results[1] = foldspurious;
            results[2] = foldcorrect;
            return results;
        }
    }
}