edu.iu.daal_naive.NaiveUtil.java Source code

Java tutorial

Introduction

Here is the source code for edu.iu.daal_naive.NaiveUtil.java

Source

/*
 * Copyright 2013-2016 Indiana University
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.iu.daal_naive;

import it.unimi.dsi.fastutil.ints.IntArrays;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import edu.iu.harp.schdynamic.DynamicScheduler;
import edu.iu.harp.partition.Partition;
import edu.iu.harp.partition.Table;
import edu.iu.harp.resource.DoubleArray;

import edu.iu.data_gen.*;

public class NaiveUtil {
    /** Commons Logging logger bound to this class. */
    protected static final Log LOG = LogFactory.getLog(NaiveUtil.class);

    /**
     * Reads the given point files through a {@link DynamicScheduler} backed by
     * {@code numThreads} {@code PointLoadTask} workers and collects every non-null
     * result the scheduler produces.
     *
     * @param fileNames     names of the files to submit to the scheduler, one input per file
     * @param pointsPerFile not used by this method
     * @param cenVecSize    passed to each {@code PointLoadTask} constructor
     * @param conf          passed to each {@code PointLoadTask} constructor
     * @param numThreads    number of {@code PointLoadTask} instances handed to the scheduler
     * @return one list of {@code double[]} per non-null scheduler output
     */
    public static List<List<double[]>> loadPoints(List<String> fileNames, int pointsPerFile, int cenVecSize,
            Configuration conf, int numThreads) {
        final long begin = System.currentTimeMillis();

        // Build one loader per worker thread.
        List<PointLoadTask> loaders = new LinkedList<>();
        int created = 0;
        while (created < numThreads) {
            loaders.add(new PointLoadTask(cenVecSize, conf));
            created++;
        }
        List<List<double[]>> loaded = new LinkedList<List<double[]>>();

        // Queue every file, then run the scheduler.
        // NOTE(review): stop() presumably waits for the queued inputs to be processed — confirm
        // against DynamicScheduler.
        DynamicScheduler<String, List<double[]>, PointLoadTask> scheduler = new DynamicScheduler<>(loaders);
        for (String name : fileNames) {
            scheduler.submit(name);
        }
        scheduler.start();
        scheduler.stop();

        // Drain the outputs, skipping null results.
        while (scheduler.hasOutput()) {
            List<double[]> points = scheduler.waitForOutput();
            if (points == null) {
                continue;
            }
            loaded.add(points);
        }

        final long elapsed = System.currentTimeMillis() - begin;
        System.out.println("File read (ms): " + elapsed + ", number of point arrays: " + loaded.size());
        return loaded;
    }

    /**
     * Generates the training data set in a local directory using one
     * {@code DataGenNaiveBayes} task per file, then copies that directory to
     * {@code dataDir} on the given file system.
     *
     * <p>Any existing {@code dataDir} is deleted first, and the local directory is
     * emptied and recreated. Each generator task is given
     * {@code numOfDataPoints / numPointFiles} points (integer division), so a
     * remainder is not distributed.
     *
     * @param numOfDataPoints total number of points requested
     * @param vectorSize      passed to each generator task
     * @param numPointFiles   number of files (and generator tasks); must be positive
     * @param nClasses        passed to each generator task
     * @param localInputDir   local directory the files are generated in
     * @param fs              target file system
     * @param dataDir         destination directory on {@code fs}
     * @throws IllegalArgumentException if {@code numPointFiles} is not positive
     * @throws IOException              if the local directory cannot be created, if there would be
     *                                  zero points per file, or if a file-system call fails
     * @throws InterruptedException     if interrupted while waiting for a generator task
     * @throws ExecutionException       if a generator task fails
     */
    static void generatePoints(int numOfDataPoints, int vectorSize, int numPointFiles, int nClasses,
            String localInputDir, FileSystem fs, Path dataDir)
            throws IOException, InterruptedException, ExecutionException {

        // Guard the division below; a non-positive file count can never produce data.
        if (numPointFiles <= 0) {
            throw new IllegalArgumentException("numPointFiles must be positive: " + numPointFiles);
        }
        int pointsPerFile = numOfDataPoints / numPointFiles;
        System.out.println("Writing " + pointsPerFile + " vectors to a file");
        // Remove any previous copy of the data set on the target file system
        if (fs.exists(dataDir)) {
            fs.delete(dataDir, true);
        }
        // Empty and recreate the local directory so stale files are not uploaded
        File localDir = new File(localInputDir);
        if (localDir.exists() && localDir.isDirectory()) {
            // listFiles() returns null on an I/O error, so it must be checked before iterating
            File[] staleFiles = localDir.listFiles();
            if (staleFiles != null) {
                for (File file : staleFiles) {
                    file.delete();
                }
            }
            localDir.delete();
        }
        if (localDir.mkdir()) {
            System.out.println("Directory: " + localInputDir + " created");
        } else if (!localDir.isDirectory()) {
            // Without a directory every generator task would fail later with a less clear error
            throw new IOException("Cannot create local directory: " + localInputDir);
        }
        if (pointsPerFile == 0) {
            throw new IOException("No point to write.");
        }
        // Create random data points, one task per output file
        int poolSize = Runtime.getRuntime().availableProcessors();
        ExecutorService service = Executors.newFixedThreadPool(poolSize);
        try {
            List<Future<?>> futures = new LinkedList<Future<?>>();
            for (int k = 0; k < numPointFiles; k++) {
                Future<?> f = service.submit(new DataGenNaiveBayes(localInputDir, Integer.toString(k),
                        pointsPerFile, vectorSize, nClasses));
                futures.add(f);
            }
            // Wait for every file to be written; get() rethrows a task failure
            for (Future<?> f : futures) {
                f.get();
            }
        } finally {
            // Always stop the pool, including on failure or interruption, so its
            // non-daemon worker threads cannot keep the JVM alive
            service.shutdownNow();
        }
        // Upload the generated directory
        Path localInput = new Path(localInputDir);
        fs.copyFromLocalFile(localInput, dataDir);
    }

    /**
     * Writes a single local file named {@code testdata} containing random test
     * rows and copies the local directory to {@code dataDir} on the given file system.
     *
     * <p>Each row holds {@code vectorSize} comma-terminated doubles drawn uniformly
     * from [-1, 1), followed by an integer label drawn from [0, {@code nClasses}).
     * Any existing {@code dataDir} is deleted first, and the local directory is
     * emptied and recreated.
     *
     * @param numOfDataPoints number of rows to write
     * @param vectorSize      number of feature values per row
     * @param nClasses        exclusive upper bound for the generated labels
     * @param localInputDir   local directory the file is written to
     * @param fs              target file system
     * @param dataDir         destination directory on {@code fs}
     * @throws IOException              if the local directory cannot be created or a write or
     *                                  file-system call fails
     * @throws IllegalArgumentException from {@link Random#nextInt(int)} if {@code nClasses} is not
     *                                  positive and at least one row is written
     * @throws InterruptedException     declared for callers; not thrown by this implementation
     * @throws ExecutionException       declared for callers; not thrown by this implementation
     */
    static void generateTestPoints(int numOfDataPoints, int vectorSize, int nClasses, String localInputDir,
            FileSystem fs, Path dataDir) throws IOException, InterruptedException, ExecutionException {

        // Remove any previous copy of the data set on the target file system
        if (fs.exists(dataDir)) {
            fs.delete(dataDir, true);
        }
        // Empty and recreate the local directory so stale files are not uploaded
        File localDir = new File(localInputDir);
        if (localDir.exists() && localDir.isDirectory()) {
            // listFiles() returns null on an I/O error, so it must be checked before iterating
            File[] staleFiles = localDir.listFiles();
            if (staleFiles != null) {
                for (File file : staleFiles) {
                    file.delete();
                }
            }
            localDir.delete();
        }
        if (localDir.mkdir()) {
            System.out.println("Directory: " + localInputDir + " created");
        } else if (!localDir.isDirectory()) {
            throw new IOException("Cannot create local directory: " + localInputDir);
        }

        // Generate test points; try-with-resources closes the file even if a write fails
        Random random = new Random();
        try (BufferedWriter writer = new BufferedWriter(
                new FileWriter(localInputDir + File.separator + "testdata"))) {
            for (int i = 0; i < numOfDataPoints; i++) {
                for (int j = 0; j < vectorSize; j++) {
                    double point = random.nextDouble() * 2 - 1;
                    writer.write(String.valueOf(point));
                    writer.write(",");
                }

                int label = random.nextInt(nClasses);
                writer.write(String.valueOf(label));
                writer.newLine();
            }
        }
        System.out.println("Write test data file");

        // Upload the generated directory
        Path localInput = new Path(localInputDir);
        fs.copyFromLocalFile(localInput, dataDir);

    }

    /**
     * Writes a single local file named {@code groundtruth} containing one random
     * integer label per line, drawn from [0, {@code nClasses}), and copies the
     * local directory to {@code dataDir} on the given file system.
     *
     * <p>Any existing {@code dataDir} is deleted first, and the local directory is
     * emptied and recreated.
     *
     * <p>NOTE(review): the labels come from a fresh {@link Random} and are therefore
     * independent of the labels {@code generateTestPoints} appends to its rows —
     * confirm this is intended.
     *
     * @param numOfDataPoints number of labels to write
     * @param nClasses        exclusive upper bound for the generated labels
     * @param localInputDir   local directory the file is written to
     * @param fs              target file system
     * @param dataDir         destination directory on {@code fs}
     * @throws IOException              if the local directory cannot be created or a write or
     *                                  file-system call fails
     * @throws IllegalArgumentException from {@link Random#nextInt(int)} if {@code nClasses} is not
     *                                  positive and at least one label is written
     * @throws InterruptedException     declared for callers; not thrown by this implementation
     * @throws ExecutionException       declared for callers; not thrown by this implementation
     */
    static void generateGroundTruth(int numOfDataPoints, int nClasses, String localInputDir, FileSystem fs,
            Path dataDir) throws IOException, InterruptedException, ExecutionException {

        // Remove any previous copy of the data set on the target file system
        if (fs.exists(dataDir)) {
            fs.delete(dataDir, true);
        }
        // Empty and recreate the local directory so stale files are not uploaded
        File localDir = new File(localInputDir);
        if (localDir.exists() && localDir.isDirectory()) {
            // listFiles() returns null on an I/O error, so it must be checked before iterating
            File[] staleFiles = localDir.listFiles();
            if (staleFiles != null) {
                for (File file : staleFiles) {
                    file.delete();
                }
            }
            localDir.delete();
        }
        if (localDir.mkdir()) {
            System.out.println("Directory: " + localInputDir + " created");
        } else if (!localDir.isDirectory()) {
            throw new IOException("Cannot create local directory: " + localInputDir);
        }

        // Generate ground-truth labels; try-with-resources closes the file even if a write fails
        Random random = new Random();
        try (BufferedWriter writer = new BufferedWriter(
                new FileWriter(localInputDir + File.separator + "groundtruth"))) {
            for (int i = 0; i < numOfDataPoints; i++) {
                int label = random.nextInt(nClasses);
                writer.write(String.valueOf(label));
                writer.newLine();
            }
        }
        System.out.println("Write groundtruth data file");

        // Upload the generated directory
        Path localInput = new Path(localInputDir);
        fs.copyFromLocalFile(localInput, dataDir);

    }

    /**
     * Generates the training, test and ground-truth data sets in turn, each staged
     * in {@code localDir} and copied to its own destination directory, then removes
     * {@code localDir}.
     *
     * <p>The local directory is a scratch area: every step empties it before
     * writing, and it is deleted when this method returns or throws.
     *
     * @param numDataPoints number of training points
     * @param numTestPoints number of test rows and of ground-truth labels
     * @param vectorSize    number of feature values per point
     * @param numPointFiles number of training files
     * @param nClasses      number of label classes
     * @param fs            target file system
     * @param localDir      local scratch directory used for staging
     * @param trainDir      destination of the training data on {@code fs}
     * @param testDir       destination of the test data on {@code fs}
     * @param groundTrueDir destination of the ground-truth data on {@code fs}
     * @throws IOException          if a generation step or file-system call fails
     * @throws InterruptedException if interrupted while generating the training data
     * @throws ExecutionException   if a training-data generator task fails
     */
    public static void generateData(int numDataPoints, int numTestPoints, int vectorSize, int numPointFiles,
            int nClasses, FileSystem fs, String localDir, Path trainDir, Path testDir, Path groundTrueDir)
            throws IOException, InterruptedException, ExecutionException {
        try {
            System.out.println("Generating training data..... ");
            generatePoints(numDataPoints, vectorSize, numPointFiles, nClasses, localDir, fs, trainDir);

            System.out.println("Generating test data..... ");
            generateTestPoints(numTestPoints, vectorSize, nClasses, localDir, fs, testDir);

            System.out.println("Generating groundtruth data..... ");
            generateGroundTruth(numTestPoints, nClasses, localDir, fs, groundTrueDir);
        } finally {
            // Remove the scratch directory even when a step fails, so partially
            // generated data is not left on the local disk
            DeleteFileFolder(localDir);
        }

    }

    /**
     * Recursively deletes the file or directory at {@code path}.
     *
     * <p>Prints a message when the path does not exist. The tree is removed in a
     * single post-order pass; if anything cannot be deleted, a message is written
     * to {@code System.err} and the method returns instead of retrying forever.
     *
     * @param path local path of the file or directory to remove
     */
    public static void DeleteFileFolder(String path) {

        File file = new File(path);
        if (!file.exists()) {
            System.out.println("File or Folder not found : " + path);
            return;
        }
        if (!delete(file)) {
            System.err.println("Failed to delete : " + path);
        }

    }

    /**
     * Deletes {@code file}, first deleting its children when it is a directory.
     *
     * @param file file or directory to remove
     * @return {@code true} if {@code file} itself was deleted; a directory whose
     *         children could not all be removed is non-empty and therefore yields {@code false}
     */
    private static boolean delete(File file) {
        if (file.isDirectory()) {
            // list() returns null on an I/O error; the final delete() then reports the failure
            String[] children = file.list();
            if (children != null) {
                for (String child : children) {
                    // File(parent, child) joins with the platform separator
                    delete(new File(file, child));
                }
            }
            System.out.println("Deleting Directory : " + file.getPath());
        } else {
            System.out.println("Deleting file : " + file.getPath());
        }
        return file.delete();
    }
}