Java tutorial
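The listing below, CrossValidateShapelets, builds ten-fold cross-validation splits of time-series classification problems, shapelet-transforms each fold in its own thread, and classifies the transformed folds with a CV-weighted heterogeneous ensemble of Weka classifiers. The various entry points (formCV, doTransform, classifyProblem, and so on) are switched by commenting lines in and out of main.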
/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package development;

import fileIO.InFile;
import fileIO.OutFile;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Random;
import java.util.logging.Level;
import java.util.logging.Logger;
import utilities.ClassifierTools;
import weka.classifiers.Classifier;
import weka.classifiers.bayes.BayesNet;
import weka.classifiers.bayes.NaiveBayes;
import weka.classifiers.functions.SMO;
import weka.classifiers.functions.supportVector.PolyKernel;
import weka.classifiers.lazy.kNN;
import weka.classifiers.meta.HeterogeneousEnsemble;
import weka.classifiers.meta.RotationForest;
import weka.classifiers.trees.J48;
import weka.classifiers.trees.RandomForest;
import weka.core.Instances;
import weka.filters.NormalizeCase;
import weka.filters.timeseries.shapelet_transforms.FullShapeletTransform;
import weka.filters.timeseries.shapelet_transforms.ShapeletTransform;
import weka.filters.timeseries.shapelet_transforms.ShapeletTransformDistCaching;

/**
 * Builds 10-fold cross-validation splits of time-series problems,
 * shapelet-transforms each fold (one thread per fold) and classifies the
 * transformed folds with a heterogeneous ensemble.
 *
 * @author ajb
 */
public class CrossValidateShapelets extends Thread {

    Instances train;
    Instances test;
    int fold;
    String path;
    public static String fileName;
    public static boolean useCluster = true;

    // UCR ones first
    static int[] missing = {45, 46, 61, 71, 72, 73, 74, 20, 26, 27, 57};
    static int[] incomplete = {45, 46, 61, 71, 72, 73, 74, 20, 26, 27, 57};

    public CrossValidateShapelets(Instances tr, Instances te, int f, String path) {
        train = tr;
        test = te;
        fold = f;
        this.path = path;
    }

    public static void formCV() {
        // Delete any existing shapelet files for the incomplete problems
        for (int i = 0; i < incomplete.length; i++) {
            File f = new File("/gpfs/sys/ajb/TSC Problems/" + DataSets.fileNames[incomplete[i]] + "/ShapeletCV/");
            // Delete everything there
            if (f.exists()) {
                try {
                    delete(f);
                } catch (IOException e) {
                    System.err.println(" Unable to delete directory ShapeletCV/ Continuing ");
                }
            }
            // Recreate the directory
            if (!f.exists()) {
                f.mkdir();
            }
        }
        for (int i = 0; i < missing.length; i++) {
            String clusterPath = "/gpfs/sys/ajb/TSC Problems/" + DataSets.fileNames[missing[i]] + "/";
            String dropboxPath = "C:/Users/ajb/Dropbox/TSC Problems/" + DataSets.fileNames[missing[i]] + "/";
//            String path = dropboxPath;
            String path = clusterPath;
            Instances train = ClassifierTools.loadData(path + DataSets.fileNames[missing[i]] + "_TRAIN");
            System.out.println("Processing : " + DataSets.fileNames[missing[i]]);
            NormalizeCase nc = new NormalizeCase();
            try {
                train = nc.process(train);
            } catch (Exception e) {
                System.out.println(" Unable to normalise for some unknown reason " + e + " but continuing...");
            }
            // Randomise the data, saving the mapping back to the original order to file
            int[] positions = new int[train.numInstances()];
            train = randomise(train, positions);
            OutFile of = new OutFile(path + "ShapeletCV/InstancePositions.csv");
            for (int j = 0; j < positions.length; j++)
                of.writeLine(positions[j] + ",");
            of = new OutFile(path + "InstancePositions.csv");
            for (int j = 0; j < positions.length; j++)
                of.writeLine(positions[j] + ",");
            // Split into time domain folds
            int folds = 10;
            Instances[] trainFolds = new Instances[folds];
            Instances[] testFolds = new Instances[folds];
            splitTrainData(train, trainFolds, testFolds, folds);
            // Save folds to file
            for (int j = 1; j <= folds; j++) {
                OutFile of1 = new OutFile(path + DataSets.fileNames[missing[i]] + "_TRAIN" + (j) + ".arff");
                OutFile of2 = new OutFile(path + DataSets.fileNames[missing[i]] + "_TEST" + (j) + ".arff");
                of1.writeLine(trainFolds[j - 1].toString());
                of2.writeLine(testFolds[j - 1].toString());
            }
        }
    }

    public void run() {
        // Perform either the cached or the online transform
        FullShapeletTransform st = new ShapeletTransformDistCaching();
        st.useCandidatePruning(10);
//        if (train.numInstances() >= 500 || train.numAttributes() > 500)
//            st = new ShapeletTransform();
        st.supressOutput();
        st.setNumberOfShapelets(Math.max(train.numAttributes(), train.numInstances()));
        try {
            Instances sTrain = st.process(train);
            Instances sTest = st.process(test);
            OutFile of1 = new OutFile(path + fileName + "_TRAIN" + (fold + 1) + ".arff");
            OutFile of2 = new OutFile(path + fileName + "_TEST" + (fold + 1) + ".arff");
            of1.writeLine(sTrain.toString());
            of2.writeLine(sTest.toString());
        } catch (Exception ex) {
            Logger.getLogger(CrossValidateShapelets.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    public static void splitTrainData(Instances train, Instances[] trainFolds, Instances[] testFolds, int folds) {
        int size = train.numInstances();
        int foldSize = size / folds;
        int[] foldCV = new int[folds];
        for (int i = 0; i < foldCV.length; i++)
            foldCV[i] = foldSize;
        if (size % folds != 0) // Put the remainder in the last fold
            foldCV[folds - 1] = size - foldSize * (folds - 1);
        int diff = foldCV[folds - 1] - foldSize;
        int c = 0;
        while (diff > 0) {
            // Reassign surplus instances from the last fold to the earlier folds
            foldCV[c % (folds - 1)]++;
            foldCV[folds - 1]--;
            diff = foldCV[folds - 1] - foldCV[c % (folds - 1)];
            c++;
        }
        Instances copy = new Instances(train);
        int start = 0;
        for (int i = 0; i < folds; i++) {
            trainFolds[i] = new Instances(copy, 0);
            testFolds[i] = new Instances(copy, 0);
            for (int j = 0; j < train.numInstances(); j++) {
                if (j < start || j >= start + foldCV[i])
                    trainFolds[i].add(train.instance(j));
                else
                    testFolds[i].add(train.instance(j));
            }
            start += foldCV[i];
        }
    }

    public static Instances randomise(Instances train, int[] pos) {
        // Generate a random permutation into pos by repeated random transpositions
        Random r = new Random();
        for (int i = 0; i < pos.length; i++)
            pos[i] = i;
        for (int i = 0; i < pos.length; i++) {
            int p1 = r.nextInt(pos.length);
            int p2 = r.nextInt(pos.length);
            int temp = pos[p1];
            pos[p1] = pos[p2];
            pos[p2] = temp;
        }
        Instances newD = new Instances(train, 0);
        for (int i = 0; i < pos.length; i++)
            newD.add(train.instance(pos[i]));
        return newD;
    }

    public static void singleRunThreaded(String file) {
//        String file = "ItalyPowerDemand";
        String clusterPath = "/gpfs/sys/ajb/TSC Problems/" + file + "/";
        String desktopPath = "C:/Users/ajb/Dropbox/TSC Problems/" + file + "/";
        String path = desktopPath;
        if (useCluster)
            path = clusterPath;
        String filePath = path + "ShapeletCV/";
        int count = 0;
        // Create the directory if it isn't there already
        File dir = new File(filePath);
        if (!dir.exists()) {
            dir.mkdir();
        } else {
            // Comment out the check below to allow overwriting
            boolean present = true;
            for (int i = 1; i <= 10 && present; i++) {
                File cv = new File(filePath + file + "_TRAIN" + i + ".arff");
                File cv2 = new File(filePath + file + "_TEST" + i + ".arff");
                if (cv.exists() && cv2.exists()) // CV files already there
                    count++;
                else
                    present = false;
            }
            if (count == 10) // All folds done, exit now
                return;
        }
        CrossValidateShapelets.fileName = file;
        Instances train = ClassifierTools.loadData(path + file + "_TRAIN");
        NormalizeCase nc = new NormalizeCase();
        try {
            train = nc.process(train);
        } catch (Exception e) {
            System.out.println(" Unable to normalise for some unknown reason " + e + " but continuing...");
        }
        // Randomise the data and save the mapping to file
        int[] positions = new int[train.numInstances()];
        train = randomise(train, positions);
        OutFile of = new OutFile(filePath + "InstancePositions.csv");
        for (int i = 0; i < positions.length; i++)
            of.writeLine(positions[i] + ",");
        // Split data into folds
        int folds = 10;
        Instances[] trainFolds = new Instances[folds];
        Instances[] testFolds = new Instances[folds];
        splitTrainData(train, trainFolds, testFolds, folds);
        CrossValidateShapelets[] ct = new CrossValidateShapelets[folds];
        for (int i = 0; i < folds; i++) {
            ct[i] = new CrossValidateShapelets(trainFolds[i], testFolds[i], i, filePath);
        }
        for (int i = 0; i < folds; i++) {
            // Only start the threads where the files are not already there
            ct[i].start();
        }
        try {
            for (int i = 0; i < folds; i++)
                ct[i].join();
        } catch (InterruptedException ex) {
            Logger.getLogger(CrossValidateShapelets.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    public static int countFiles(String file) {
        String path = "/gpfs/sys/ajb/TSC Problems/" + file + "/";
        String filePath = path + "ShapeletCV/";
        // See if the job has already been done; if so, don't bother
        int count = 0;
        for (int j = 1; j <= 10; j++) {
            File cv = new File(filePath + file + "_TRAIN" + j + ".arff");
            if (cv.exists()) // CV files already there
                count++;
        }
        return count;
    }

    public static void checkTransforms() {
        int totalCount = 0;
        for (int i = 0; i < DataSets.fileNames.length; i++) {
            String path = "/gpfs/sys/ajb/TSC Problems/" + DataSets.fileNames[i] + "/";
            String filePath = path + "ShapeletCV/";
            // See if the job has already been done; if so, don't bother
            int count = 0;
            for (int j = 1; j <= 10; j++) {
                File cv = new File(filePath + DataSets.fileNames[i] + "_TRAIN" + j + ".arff");
                if (cv.exists()) // CV files already there
                    count++;
            }
            if (count == 10)
                totalCount++;
            else
                System.out.println("PROBLEM " + DataSets.fileNames[i] + " IN POSITION " + i + " ONLY " + count + " CV FILES COMPLETED");
        }
        System.out.println("TOTAL COMPLETED = " + totalCount);
    }

    public static int[][] classifyFold(String file, int fold) {
        String clusterPath = "/gpfs/sys/ajb/TSC Problems/" + file + "/";
        String desktopPath = "C:/Users/ajb/Dropbox/TSC Problems/" + file + "/";
        String path = desktopPath;
        if (useCluster)
            path = clusterPath;
        String filePath = path + "ShapeletCV/";
        // Check the training and test files exist, terminate if not
        File tr = new File(filePath + file + "_TRAIN" + fold + ".arff");
        File ts = new File(filePath + file + "_TEST" + fold + ".arff");
        if (!tr.exists() || !ts.exists()) {
            System.err.println(" ERROR CLASSIFYING " + file + " fold " + fold + " file does not exist");
            return null;
        }
        // Check whether predictions already exist, terminate if so
        File r = new File(filePath + file + "Predictions" + fold + ".csv");
        if (r.exists()) {
            System.err.println(file + " fold " + fold + " Classification already done");
            return null;
        }
        Instances train = ClassifierTools.loadData(filePath + file + "_TRAIN" + fold);
        Instances test = ClassifierTools.loadData(filePath + file + "_TEST" + fold);
        ArrayList<String> names = new ArrayList<>();
        ArrayList<Classifier> c = setSingleClassifiers(names);
        HeterogeneousEnsemble hc = new HeterogeneousEnsemble(c);
        hc.useCVWeighting(true);
        int[][] preds = new int[2][test.numInstances()];
        try {
            hc.buildClassifier(train);
            for (int i = 0; i < test.numInstances(); i++) {
                preds[0][i] = (int) test.instance(i).classValue();
                preds[1][i] = (int) hc.classifyInstance(test.instance(i));
            }
        } catch (Exception ex) {
            Logger.getLogger(CrossValidateShapelets.class.getName()).log(Level.SEVERE, null, ex);
        }
        // Save results to the appropriate file
        double[] cvAccs = hc.getWeights();
        OutFile results = new OutFile(filePath + file + "Predictions" + fold + ".csv");
        for (int i = 0; i < cvAccs.length; i++)
            results.writeString(cvAccs[i] + ",");
        results.writeString("\n Actual,Predicted\n");
        int correct = 0;
        for (int i = 0; i < preds[0].length; i++) {
            results.writeString(preds[0][i] + "," + preds[1][i] + "\n");
            if (preds[0][i] == preds[1][i])
                correct++;
        }
        System.out.println(" Fold = " + fold + " correct = " + correct + " acc = " + ((double) correct) / preds[0].length);
        return preds;
    }

    public static void combineandInvertFolds() {
        OutFile all = new OutFile("/gpfs/sys/ajb/shapeletCV/TrainCV.csv");
//        OutFile all = new OutFile("C:/Users/ajb/Dropbox/TSC Problems/TrainCV.csv");
        fileLoop:
        for (int i = 0; i < DataSets.fileNames.length; i++) {
            all.writeString("\n" + DataSets.fileNames[i]);
            // Check the predictions exist; if not, skip the problem
            String path = "/gpfs/sys/ajb/TSC Problems/";
//            String path = "C:/Users/ajb/Dropbox/TSC Problems/";
            int count = 0;
            for (int j = 1; j <= 10; j++) {
                File f = new File(path + DataSets.fileNames[i] + "/ShapeletCV/" + DataSets.fileNames[i] + "Predictions" + j + ".csv");
                if (f.exists())
                    count++;
            }
            if (count < 10) { // Skip this problem
                System.out.println(" Not enough prediction files for problem " + DataSets.fileNames[i] + " num = " + count);
                continue fileLoop;
            }
            // Check whether the combined file exists; if it does, do nothing
//            File f = new File("/gpfs/sys/ajb/shapeletCV/" + DataSets.fileNames[i] + "Preds.csv");
//            if (f.exists()) // Skip this problem
//                continue fileLoop;
            // Concatenate the fold predictions into a single file
            String str = "/gpfs/sys/ajb/shapeletCV/";
//            String str = "/gpfs/sys/ajb/TSC Problems/";
            OutFile of = new OutFile(str + DataSets.fileNames[i] + "Preds.csv");
            of.writeLine("actual,predicted");
            OutFile of2 = new OutFile(str + DataSets.fileNames[i] + "CV_Accs.csv");
            ArrayList<int[]> preds = new ArrayList<>();
            int lines;
            InFile inF;
            for (int j = 1; j <= 10; j++) {
                inF = new InFile(path + DataSets.fileNames[i] + "/ShapeletCV/" + DataSets.fileNames[i] + "Predictions" + j + ".csv");
                lines = inF.countLines() - 2;
                System.out.println(" Number of lines = " + lines);
                inF = new InFile(path + DataSets.fileNames[i] + "/ShapeletCV/" + DataSets.fileNames[i] + "Predictions" + j + ".csv");
                of2.writeLine(inF.readLine());
                inF.readLine();
                for (int k = 0; k < lines; k++) {
                    int[] d = new int[2];
                    d[0] = inF.readInt();
                    d[1] = inF.readInt();
                    preds.add(d);
                }
            }
            // Load the ordering
            int[] orders = new int[preds.size()];
            inF = new InFile(path + DataSets.fileNames[i] + "/ShapeletCV/InstancePositions.csv");
            lines = inF.countLines();
            if (lines != preds.size()) { // ERROR
                System.err.println(" BIG ERROR: reordering count does not equal the number of cases in the file! Problem = " + DataSets.fileNames[i]);
                System.err.println("\t\t in recorded positions there are " + lines + ", in the combined results there are " + preds.size());
                continue fileLoop;
            }
            inF = new InFile(path + DataSets.fileNames[i] + "/ShapeletCV/InstancePositions.csv");
            for (int k = 0; k < lines; k++)
                orders[k] = inF.readInt();
            // Reorder into the original order and work out the CV train accuracy
            int[][] results = new int[lines][];
            int correct = 0;
            for (int k = 0; k < lines; k++) {
                results[orders[k]] = preds.get(k);
                if (results[orders[k]][0] == results[orders[k]][1])
                    correct++;
            }
            // Print to file (actual, then predicted)
            for (int k = 0; k < lines; k++)
                of.writeLine(results[k][0] + "," + results[k][1]); // was results[k][0] twice, which dropped the predictions
            all.writeString("," + ((double) correct) / lines);
        }
    }

    public static ArrayList<Classifier> setSingleClassifiers(ArrayList<String> names) {
        ArrayList<Classifier> sc = new ArrayList<>();
        kNN n = new kNN(50);
        n.setCrossValidate(true);
        sc.add(n);
        names.add("kNN");
        sc.add(new J48());
        names.add("C45");
        sc.add(new NaiveBayes());
        names.add("NB");
        BayesNet bn = new BayesNet();
        sc.add(bn);
        names.add("BayesNet");
        RandomForest rf = new RandomForest();
        rf.setNumTrees(200);
        sc.add(rf);
        names.add("RandForest");
        RotationForest rot = new RotationForest();
        rot.setNumIterations(30);
        sc.add(rot); // was sc.add(rf), which added the random forest twice and dropped the rotation forest
        names.add("RotForest");
        SMO svmL = new SMO();
        PolyKernel kernel = new PolyKernel();
        kernel.setExponent(1);
        svmL.setKernel(kernel);
        sc.add(svmL);
        names.add("SVML");
        kernel = new PolyKernel();
        kernel.setExponent(2);
        SMO svmQ = new SMO();
        svmQ.setKernel(kernel);
        sc.add(svmQ);
        names.add("SVMQ");
        return sc;
    }

    public static void doTransform(String[] args) {
//        checkTransforms();
//        System.exit(0);
        if (args.length == 0) {
            useCluster = false;
            System.out.println(" ON DESKTOP");
            System.out.println(" Transforming : " + DataSets.fileNames[34]);
            singleRunThreaded("ItalyPowerDemand");
        } else {
            useCluster = true;
            int num = Integer.parseInt(args[0]);
            int problemNum = num - 1;
            System.out.println(" Transforming = " + DataSets.fileNames[problemNum]);
            singleRunThreaded(DataSets.fileNames[problemNum]);
        }
    }

    public static void classifyProblem(String[] args) {
        if (args.length == 0) {
            useCluster = false;
            System.out.println(" ON DESKTOP");
            int pos = 1;
//            System.out.println(" Classifying : " + DataSets.fileNames[pos]);
:"+DataSets.fileNames[pos]); for (int i = 1; i <= 10; i++) { int[][] res = classifyFold("ItalyPowerDemand", i); } } else { useCluster = true; int n = Integer.parseInt(args[0]) - 1; int problemNum = n / 10; int foldNum = n % 10; //Results saved to individual files int[][] res = classifyFold(DataSets.fileNames[problemNum], foldNum + 1); } } public static void purge() { //Delete all CV files from the cluster useCluster = true; for (int i = 0; i < DataSets.fileNames.length; i++) { String clusterPath = "/gpfs/sys/ajb/TSC Problems/" + DataSets.fileNames[i] + "/"; String desktopPath = "C:/Users/ajb/Dropbox/TSC Problems/" + DataSets.fileNames[i] + "/"; String path = desktopPath; if (useCluster) path = clusterPath; File f = new File(path + "ShapeletCV/"); if (f.exists()) { try { delete(f); } catch (IOException e) { System.err.println(" Unable to delete directory " + path + "ShapeletCV/ Continuing "); } } } } public static void delete(File file) throws IOException { if (file.isDirectory()) { //directory is empty, then delete it if (file.list().length == 0) { file.delete(); System.out.println("Directory is deleted : " + file.getAbsolutePath()); } else { //list all the directory contents String files[] = file.list(); for (String temp : files) { //construct the file structure File fileDelete = new File(file, temp); //recursive delete delete(fileDelete); } //check the directory again, if empty then delete it if (file.list().length == 0) { file.delete(); System.out.println("Directory is deleted : " + file.getAbsolutePath()); } } } else { //Base case //if file, then delete it file.delete(); System.out.println("File is deleted : " + file.getAbsolutePath()); } } public static void transformIncomplete(String[] args) { int length = incomplete.length; //12 of these int n = Integer.parseInt(args[0]) - 1; int problemNum = n / 10; int foldNum = n % 10; if (problemNum >= length) //Error return; problemNum = incomplete[problemNum]; doSingleTransform(problemNum, foldNum); } public static void doSingleTransform(int problemNum, int foldNum) { String fileName = DataSets.fileNames[problemNum]; String clusterPath = "/gpfs/sys/ajb/TSC Problems/" + fileName + "/"; String path = clusterPath; String shapeletPath = path + "ShapeletCV/"; File f1 = new File(shapeletPath + fileName + "_TRAIN" + (foldNum + 1) + ".arff"); File f2 = new File(shapeletPath + fileName + "_TEST" + (foldNum + 1) + ".arff"); if (f1.exists() && f2.exists()) { System.out.println(" Transform " + foldNum + " problem " + fileName + " already exists"); return; } Instances train = ClassifierTools.loadData(clusterPath + fileName + "_TRAIN" + (foldNum + 1)); Instances test = ClassifierTools.loadData(clusterPath + fileName + "_TEST" + (foldNum + 1)); FullShapeletTransform st = new ShapeletTransformDistCaching(); // if(train.numInstances()>=500 || train.numAttributes()>500) // st = new ShapeletTransform(); st.supressOutput(); st.setNumberOfShapelets(Math.max(train.numAttributes(), train.numInstances())); try { Instances sTrain = st.process(train); Instances sTest = st.process(test); OutFile of1 = new OutFile(shapeletPath + fileName + "_TRAIN" + (foldNum + 1) + ".arff"); OutFile of2 = new OutFile(shapeletPath + fileName + "_TEST" + (foldNum + 1) + ".arff"); of1.writeLine(sTrain.toString()); of2.writeLine(sTest.toString()); } catch (Exception ex) { Logger.getLogger(CrossValidateShapelets.class.getName()).log(Level.SEVERE, null, ex); } } public static void shapeletTrainSingle(String file) { String clusterPath = "/gpfs/sys/ajb/ShapeletTransformed/"; String desktopPath 
= "C:/Users/ajb/Dropbox/TSC Problems/ShapeletTransformed/"; String path = desktopPath; if (useCluster) path = clusterPath; //Load OutFile of = new OutFile(path + "TrainCV/" + file + "_trainCVacc.csv"); File f = new File(path + file + "Transformed_TRAIN"); if (!f.exists()) { of.writeLine(file + "," + "-1"); } Instances train = ClassifierTools.loadData(path + file + "Transformed_TRAIN"); //Get classifiers ArrayList<String> names = new ArrayList<>(); ArrayList<Classifier> c = setSingleClassifiers(names); HeterogeneousEnsemble hc = new HeterogeneousEnsemble(c); hc.useCVWeighting(true); //Find Accuracy double acc = ClassifierTools.stratifiedCrossValidation(train, hc, 10, 1); //Get individual stats //Write to file of.writeLine(file + "," + acc); } public static void shapeletTrainCV(String[] args) { if (args.length == 0) { useCluster = false; System.out.println(" ON DESKTOP"); int pos = 1; System.out.println(" Transforming :" + DataSets.fileNames[34]); shapeletTrainSingle("ItalyPowerDemand"); } else { useCluster = true; int num = Integer.parseInt(args[0]); int problemNum = num - 1; System.out.println(" Transforming =" + DataSets.fileNames[problemNum]); shapeletTrainSingle(DataSets.fileNames[problemNum]); } } public static void combineShapeletTrain() { String path = "C:/Users/ajb/Dropbox/Results/ShapeletDomain/TrainCV/"; OutFile combo = new OutFile(path + "allResults.csv"); for (String s : DataSets.fileNames) { File f = new File(path + s + "_trainCVacc.csv"); if (!f.exists()) { combo.writeLine(s + ","); System.out.println(path + s + "_trainCVacc.csv" + " DOES NOT EXIST"); } else { InFile inf = new InFile(path + s + "_trainCVacc.csv"); String str = inf.readLine(); combo.writeLine(str + "," + inf.readLine()); } } } public static void main(String[] args) { // formCV(); // transformIncomplete(args); // doTransform(args); // classifyProblem(args); // combineandInvertFolds(); // shapeletTrainCV(args); combineShapeletTrain(); } }