NCDSearch.NCDSearch.java Source code

Java tutorial

Introduction

Here is the source code for NCDSearch.NCDSearch.java

Source

/* 
   I hereby release this work into the public domain, to the extent allowed by law, per the CC0 1.0 license 
   With Love,
   -Scott!
   gitpushoriginmaster@gmail.com
*/

package NCDSearch;

import java.io.*;
import java.util.*;
import java.util.zip.Deflater;
import org.apache.commons.io.FileUtils;

/**
 * Searches a directory tree for the file that most resembles a target file, using a
 * one-directional normalized compression distance (NCD) computed with {@link Deflater}.
 *
 * <p>The search runs inside the constructor; afterwards {@link #getMatch()} and
 * {@link #getScore()} report the best (lowest scoring) candidate that was seen. When no
 * candidate was scored at all (empty or unlistable directory, unreadable or empty target)
 * they keep their initial values: {@code "Directory Empty!"} and {@code Integer.MAX_VALUE}.
 */
public class NCDSearch {

    // Deflater compression level used for every NCD computation during the search.
    private static final int COMPRESSION = 2;

    // A candidate file is scored in windows of at most this many target-lengths.
    private static final int CHUNK_FACTOR = 10;

    // Upper bound on window + target size so the concatenation built by NCD() fits in one array.
    private static final int MAX_COMBINED = Integer.MAX_VALUE - 16;

    // Size of the scratch buffer used to drain the deflater; its contents are discarded.
    private static final int SCRATCH_SIZE = 8192;

    private String bestpath = "Directory Empty!";
    private double bestscore = Integer.MAX_VALUE;
    private byte[] targetbytes = new byte[0];

    /**
     * Loads the target file and immediately searches {@code path} for the closest match.
     *
     * <p>If the target cannot be read, or is empty, a message is printed and no search is
     * performed (an empty target gives every candidate the same degenerate score).
     *
     * @param targetpath path of the file to look for
     * @param path       directory that is searched recursively
     */
    public NCDSearch(String targetpath, String path) {

        try {
            this.targetbytes = java.nio.file.Files.readAllBytes(new File(targetpath).toPath());
        } catch (IOException e) {
            System.out.println(e.toString());
            System.out.println("Could not find target!" + targetpath);
            return;
        }

        if (targetbytes.length == 0) {
            System.out.println("Target is empty!" + targetpath);
            return;
        }

        search(path, COMPRESSION);
    }

    // Recursively searches through the directory given by path, using the NCD to look for similar files.
    private void search(String path, int compression) {

        File[] files = new File(path).listFiles();
        if (files == null) {
            // listFiles() returns null when path is not a directory or cannot be read.
            System.out.println("Could not list directory!" + path);
            return;
        }

        for (File file : files) {
            if (file.isDirectory()) {
                search(file.getPath(), compression);
            } else {
                scoreFile(file, compression);
            }
        }
    }

    // Scores one file window by window and records it as the best match if any window beats the current best.
    private void scoreFile(File file, int compression) {

        int limit = chunkSize();
        long size = file.length();
        // Do not allocate a full-size window for files that are known to be smaller than it.
        byte[] window = new byte[size > 0 ? (int) Math.min(size, limit) : limit];

        try (InputStream reader = new FileInputStream(file)) {
            int filled;
            while ((filled = readFully(reader, window)) > 0) {
                // The last window of a file is usually short; score only the bytes actually read.
                byte[] filebytes = (filled == window.length) ? window : Arrays.copyOf(window, filled);
                double score = NCD(filebytes, targetbytes, compression);

                if (score < bestscore) {
                    this.bestscore = score;
                    this.bestpath = file.getPath();
                }
            }
        } catch (IOException e) {
            System.out.println(e.toString());
            System.out.println("Could not read file!" + file.getPath());
        }
    }

    // Largest window to score at once: CHUNK_FACTOR target-lengths, at least 1, computed without int overflow.
    private int chunkSize() {
        long wanted = (long) CHUNK_FACTOR * targetbytes.length;
        long allowed = (long) MAX_COMBINED - targetbytes.length;
        return (int) Math.max(1L, Math.min(wanted, allowed));
    }

    // Fills buffer from the stream until it is full or the stream ends; returns the number of bytes stored.
    private static int readFully(InputStream in, byte[] buffer) throws IOException {
        int filled = 0;
        while (filled < buffer.length) {
            int n = in.read(buffer, filled, buffer.length - filled);
            if (n < 0) {
                break;
            }
            filled += n;
        }
        return filled;
    }

    /**
     * Computes a modified one directional normalized compression distance. Computes a small
     * distance if the target is a subfile of file.
     *
     * <p>The value is {@code (C(file + target) - C(file)) / C(target)}, where {@code C(x)} is
     * the number of bytes {@link Deflater} emits for {@code x} at the given level and
     * {@code file + target} is the concatenation of the two arrays.
     *
     * @param file        candidate bytes
     * @param target      bytes being searched for
     * @param compression compression level passed to {@link Deflater#Deflater(int)}
     * @return the distance; lower means more similar
     */
    public double NCD(byte[] file, byte[] target, int compression) {

        byte[] both = new byte[file.length + target.length];
        System.arraycopy(file, 0, both, 0, file.length);
        System.arraycopy(target, 0, both, file.length, target.length);

        byte[] outputtrash = new byte[SCRATCH_SIZE];
        Deflater compressor = new Deflater(compression);
        try {
            int filecompressedsize = compressedSize(compressor, file, outputtrash);
            int targetcompressedsize = compressedSize(compressor, target, outputtrash);
            int bothcompressedsize = compressedSize(compressor, both, outputtrash);

            return (double) (bothcompressedsize - filecompressedsize) / (double) targetcompressedsize;
        } finally {
            // Release the deflater's native resources instead of waiting for garbage collection.
            compressor.end();
        }
    }

    // Returns how many bytes the compressor emits for data, draining it fully so the count is never truncated.
    private static int compressedSize(Deflater compressor, byte[] data, byte[] scratch) {
        compressor.reset();
        compressor.setInput(data);
        compressor.finish();

        int total = 0;
        while (!compressor.finished()) {
            total += compressor.deflate(scratch);
        }
        return total;
    }

    /**
     * Returns the path of the best match found, or {@code "Directory Empty!"} if no file was scored.
     *
     * @return path of the lowest scoring file
     */
    public String getMatch() {
        return bestpath;
    }

    /**
     * Returns the score of the best match, or {@code Integer.MAX_VALUE} if no file was scored.
     *
     * @return the lowest score seen
     */
    public double getScore() {
        return bestscore;
    }

    /**
     * Command line entry point: prints the best matching path and its score.
     *
     * @param args {@code args[0]} is the target file, {@code args[1]} the directory to search
     */
    public static void main(String[] args) {
        if (args.length < 2) {
            System.out.println("Usage: NCDSearch <target file> <directory>");
            return;
        }
        NCDSearch find = new NCDSearch(args[0], args[1]);
        System.out.println(find.getMatch());
        System.out.println(find.getScore());
    }

}