Java tutorial
/* ssdeep
   Copyright (C) 2006 ManTech International Corporation
   $Id: fuzzy.c 97 2010-03-19 15:10:06Z jessekornblum $

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

   The code in this file, and this file only, is based on SpamSum, part of
   the Samba project: http://www.samba.org/ftp/unpacked/junkcode/spamsum/

   Because of where this file came from, any program that contains it must
   be licensed under the terms of the General Public License (GPL). See the
   file COPYING for details. The author's original comments about licensing
   are below:

   this is a checksum routine that is specifically designed for spam.
   Copyright Andrew Tridgell <tridge@samba.org> 2002

   This code is released under the GNU General Public License version 2
   or later. Alteratively, you may also use this code under the terms of
   the Perl Artistic license.

   If you wish to distribute this code under the terms of a different free
   software license then please ask me. If there is a good reason then I
   will probably say yes. */

//package eu.scape_project.bitwiser.utils;
//https://raw.github.com/openplanets/bitwiser/master/src/main/java/eu/scape_project/bitwiser/utils/SSDeep.java
package org.codesecure.dependencycheck.utils;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Arrays;

import org.apache.commons.lang.StringUtils;

/**
 * SSDeep
 *
 * <p>
 * A Java version of the ssdeep algorithm, based on the fuzzy.c source
 * code, taken from version 2.6 of the ssdeep package.
 *
 * <p>
 * Transliteration/port to Java from C by...
 *
 * @author Andrew Jackson <Andrew.Jackson@bl.uk>
 */
public class SSDeep {

    public class FuzzyHash {
        /** the blocksize used by the program. */
        int blocksize;
        /** the hash for this blocksize. */
        String hash;
        /** the hash for twice the blocksize. */
        String hash2;
        /** the filename. */
        String filename;
    }

    /** Length of an individual fuzzy hash signature component. */
    public static final int SPAMSUM_LENGTH = 64;

    /** The longest possible length for a fuzzy hash signature (without the filename). */
    public static final int FUZZY_MAX_RESULT = (SPAMSUM_LENGTH + (SPAMSUM_LENGTH / 2 + 20));

    public static final int MIN_BLOCKSIZE = 3;
    public static final int ROLLING_WINDOW = 7;

    public static final int HASH_PRIME = 0x01000193;
    public static final int HASH_INIT = 0x28021967;

    /** Our input buffer when reading files to hash. */
    public static final int BUFFER_SIZE = 8192;

    static class roll_state_class {
        int[] window = new int[ROLLING_WINDOW];
        int h1, h2, h3;
        int n;
    }

    private static roll_state_class roll_state = new roll_state_class();
    /* a rolling hash, based on the Adler checksum. By using a rolling hash
       we can perform auto resynchronisation after inserts/deletes.

       internally, h1 is the sum of the bytes in the window and h2
       is the sum of the bytes times the index.

       h3 is a shift/xor based rolling hash, and is mostly needed to ensure
       that we can cope with large blocksize values */
    static int roll_hash(int c) {
        // System.out.println(""+roll_state.h1+","+roll_state.h2+","+roll_state.h3);
        roll_state.h2 -= roll_state.h1;
        //roll_state.h2 = roll_state.h2 & 0x7fffffff;
        roll_state.h2 += ROLLING_WINDOW * c;
        //roll_state.h2 = roll_state.h2 & 0x7fffffff;

        roll_state.h1 += c;
        //roll_state.h1 = roll_state.h1 & 0x7fffffff;
        roll_state.h1 -= roll_state.window[(roll_state.n % ROLLING_WINDOW)];
        //roll_state.h1 = roll_state.h1 & 0x7fffffff;

        roll_state.window[roll_state.n % ROLLING_WINDOW] = (char) c;
        roll_state.n = (roll_state.n + 1) % ROLLING_WINDOW;

        /* The original spamsum AND'ed this value with 0xFFFFFFFF which
           in theory should have no effect. This AND has been removed
           for performance (jk) */
        roll_state.h3 = (roll_state.h3 << 5);// & 0xFFFFFFFF;
        roll_state.h3 ^= c;
        //roll_state.h3 = roll_state.h3 & 0x7FFFFFFF;
        //if( roll_state.h3 > 0xEFFFFFFF ) roll_state.h3 -= 0xEFFFFFFF;

        long result = ((roll_state.h1 + roll_state.h2 + roll_state.h3));//&0x7FFFFFFF;
        //System.out.println("Result: "+result);
        //System.out.println("Result2: "+(result&0xFFFFFFFF));
        //System.out.println("Result3: "+(result&0x7FFFFFFF));
        return (int) result;//&0xFFFFFFFF;
    }

    /* reset the state of the rolling hash and return the initial rolling hash value */
    static void roll_reset() {
        roll_state.h1 = 0;
        roll_state.h2 = 0;
        roll_state.h3 = 0;
        roll_state.n = 0;
        Arrays.fill(roll_state.window, (char) 0);
    }

    /* a simple non-rolling hash, based on the FNV hash */
    static int sum_hash(int c, int h) {
        h *= HASH_PRIME;
        //h = h & 0xFFFFFFFF;
        h ^= c;
        //h = h & 0xFFFFFFFF;
        return h;
    }
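
    // ------------------------------------------------------------------
    // Illustration added for this tutorial (not part of the original
    // fuzzy.c port): the resynchronisation property described above.
    // h1, h2 and h3 end up depending only on the last ROLLING_WINDOW
    // bytes fed in, so two inputs that end with the same ROLLING_WINDOW
    // (or more) bytes reach the same rolling hash value regardless of how
    // their prefixes differ. This is what lets the fuzzy hash re-align
    // after an insertion or deletion earlier in the data. The sample
    // inputs and the method name are choices made for this example.
    // ------------------------------------------------------------------
    static void rollingHashResyncDemo() {
        byte[] a = "AAAA-the quick brown fox".getBytes();
        byte[] b = "zz-the quick brown fox".getBytes();

        roll_reset();
        int ha = 0;
        for (byte x : a) {
            ha = roll_hash(x & 0xFF);
        }

        roll_reset();
        int hb = 0;
        for (byte x : b) {
            hb = roll_hash(x & 0xFF);
        }

        // both inputs end with the same bytes, so ha == hb here
        System.out.println("rolling hash resync demo: " + ha + " == " + hb);
    }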
    class ss_context {
        char[] ret;
        char[] p;
        long total_chars;
        int h, h2, h3;
        int j, n, i, k;
        int block_size;
        char[] ret2 = new char[SPAMSUM_LENGTH / 2 + 1];
    }

    static void ss_destroy(ss_context ctx) {
        if (ctx.ret != null)
            ctx.ret = null; //free(ctx.ret);
    }

    static boolean ss_init(ss_context ctx, File handle) {
        if (ctx == null)
            return true;

        ctx.ret = new char[FUZZY_MAX_RESULT];
        if (ctx.ret == null)
            return true;

        if (handle != null)
            ctx.total_chars = handle.length();

        ctx.block_size = MIN_BLOCKSIZE;
        while (ctx.block_size * SPAMSUM_LENGTH < ctx.total_chars) {
            ctx.block_size = ctx.block_size * 2;
        }
        System.out.println("bs:" + ctx.block_size);
        return false;
    }

    static char[] b64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/".toCharArray();

    static void ss_engine(ss_context ctx, byte[] buffer, int buffer_size) {
        if (null == ctx || null == buffer)
            return;

        for (int i = 0; i < buffer_size; ++i) {
            /* at each character we update the rolling hash and the normal
               hash. When the rolling hash hits the reset value then we emit
               the normal hash as an element of the signature and reset both
               hashes */
            System.out.println("" + ctx.h + "," + ctx.h2 + "," + ctx.h3);
            // bytes are treated as unsigned values (0-255), as in the C original
            ctx.h = roll_hash(buffer[i] & 0xFF);// & 0x7FFFFFFF;
            ctx.h2 = sum_hash(buffer[i] & 0xFF, ctx.h2);// & 0x7FFFFFFF;
            ctx.h3 = sum_hash(buffer[i] & 0xFF, ctx.h3);// & 0x7FFFFFFF;

            if (((0xFFFFFFFFL & ctx.h) % ctx.block_size) == (ctx.block_size - 1)) {
                /* we have hit a reset point. We now emit a hash which is
                   based on all characters in the piece of the message
                   between the last reset point and this one */
                ctx.p[ctx.j] = b64[(int) ((ctx.h2 & 0xFFFF) % 64)];
                System.out.println("::" + ctx.j + ":" + new String(ctx.p));
                // for( char c : ctx.p ) {
                //     System.out.print(c);
                // }
                // System.out.println();
                if (ctx.j < SPAMSUM_LENGTH - 1) {
                    /* we can have a problem with the tail overflowing. The
                       easiest way to cope with this is to only reset the
                       second hash if we have room for more characters in
                       our signature. This has the effect of combining the
                       last few pieces of the message into a single piece */
                    ctx.h2 = HASH_INIT;
                    (ctx.j)++;
                }
            }

            /* this produces a second signature with a block size of
               block_size*2. By producing dual signatures in this way the
               effect of small changes in the message size near a block size
               boundary is greatly reduced. */
            if (((0xFFFFFFFFL & ctx.h) % (ctx.block_size * 2)) == ((ctx.block_size * 2) - 1)) {
                ctx.ret2[ctx.k] = b64[(int) ((ctx.h3 & 0xFFFF) % 64)];
                if (ctx.k < SPAMSUM_LENGTH / 2 - 1) {
                    ctx.h3 = HASH_INIT;
                    (ctx.k)++;
                }
            }
        }
    }

    static boolean ss_update(ss_context ctx, File handle) throws IOException {
        int bytes_read = 0;
        byte[] buffer;

        if (null == ctx || null == handle)
            return true;

        buffer = new byte[BUFFER_SIZE];
        if (buffer == null)
            return true;

        // snprintf(ctx.ret, 12, "%u:", ctx.block_size);
        ctx.ret = (ctx.block_size + ":").toCharArray();
        // ctx.p = ctx.ret + strlen(ctx.ret);
        ctx.p = new char[SPAMSUM_LENGTH];

        //memset(ctx.p, 0, SPAMSUM_LENGTH+1);
        Arrays.fill(ctx.p, (char) 0);
        //memset(ctx.ret2, 0, sizeof(ctx.ret2.length));
        Arrays.fill(ctx.ret2, (char) 0);

        ctx.k = ctx.j = 0;
        ctx.h3 = ctx.h2 = HASH_INIT;
        ctx.h = 0;

        roll_reset();

        System.out.println("Opening file:" + handle);
        FileInputStream in = new FileInputStream(handle);
        // while ((bytes_read = fread(buffer,sizeof(byte),BUFFER_SIZE,handle)) > 0)
        while ((bytes_read = in.read(buffer)) > 0) {
            ss_engine(ctx, buffer, bytes_read);
        }
        in.close();

        if (ctx.h != 0) {
            ctx.p[ctx.j] = b64[(int) ((ctx.h2 & 0xFFFF) % 64)];
            ctx.ret2[ctx.k] = b64[(int) ((ctx.h3 & 0xFFFF) % 64)];
        }

        // strcat(ctx.p+ctx.j, ":");
        // strcat(ctx.p+ctx.j, ctx.ret2);
        // trim() drops the unused NUL padding in the fixed-size char arrays
        // (a C string would simply stop at the first NUL)
        ctx.ret = (new String(ctx.ret) + new String(ctx.p).trim() + ":"
                + new String(ctx.ret2).trim()).toCharArray();

        // free(buffer);
        return false;
    }

    boolean fuzzy_hash_file(File handle) throws IOException {
        ss_context ctx;
        int filepos;
        boolean done = false;

        if (null == handle)
            return true;

        ctx = new ss_context();
        if (ctx == null)
            return true;

        // filepos = ftello(handle);

        ss_init(ctx, handle);
        System.out.println("bs-pre:" + ctx.block_size);

        while (!done) {
            // if (fseeko(handle,0,SEEK_SET))
            //     return true;

            ss_update(ctx, handle);
            System.out.println("RESULT:" + new String(ctx.ret));

            // our blocksize guess may have been way off - repeat if necessary
            if (ctx.block_size > MIN_BLOCKSIZE && ctx.j < SPAMSUM_LENGTH / 2)
                ctx.block_size = ctx.block_size / 2;
            else
                done = true;
        }
        System.out.println("bs-post:" + ctx.block_size);

        // strncpy(result,ctx.ret,FUZZY_MAX_RESULT);
        System.out.println("RESULT:" + new String(ctx.ret));

        ss_destroy(ctx);
        // free(ctx);

        // if (fseeko(handle,filepos,SEEK_SET))
        //     return true;

        return false;
    }

    public boolean fuzzy_hash_filename(String filename) throws IOException {
        boolean status;

        if (null == filename)
            return true;

        File handle = new File(filename);//,"rb");
        if (null == handle)
            return true;

        status = fuzzy_hash_file(handle);
        // fclose(handle);
        return status;
    }
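
    // Worked example added for this tutorial (not part of the original
    // source): how ss_init() picks the initial block size. Starting from
    // MIN_BLOCKSIZE, the block size doubles until block_size * SPAMSUM_LENGTH
    // is at least the input length, so a 10,000 byte input gets:
    //   3*64=192, 6*64=384, 12*64=768, 24*64=1536, 48*64=3072, 96*64=6144,
    //   192*64=12288 >= 10,000  ->  block_size = 192
    // The finished signature then has the form "block_size:hash:hash2", where
    // hash is emitted with block_size and hash2 with block_size * 2. The
    // method name below is a choice made for this example.
    static int demoInitialBlockSize(long totalChars) {
        int blockSize = MIN_BLOCKSIZE;
        while ((long) blockSize * SPAMSUM_LENGTH < totalChars) {
            blockSize *= 2;
        }
        return blockSize;
    }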
    boolean fuzzy_hash_buf(byte[] buf, int buf_len, char[] result) {
        ss_context ctx = new ss_context();
        boolean done = false;

        if (buf == null)
            return true;

        ctx.total_chars = buf_len;
        ss_init(ctx, null);
        System.out.println("total_chars: " + ctx.total_chars);

        while (!done) {
            // snprintf(ctx.ret, 12, "%u:", ctx.block_size);
            // ctx.p = ctx.ret + strlen(ctx.ret);
            ctx.p = new char[SPAMSUM_LENGTH + 1];
            // TODO Duplication!
            // memset(ctx.p, 0, SPAMSUM_LENGTH+1);
            // memset(ctx.ret2, 0, sizeof(ctx.ret2));
            Arrays.fill(ctx.ret2, (char) 0);

            ctx.k = ctx.j = 0;
            ctx.h3 = ctx.h2 = HASH_INIT;
            ctx.h = 0;

            roll_reset();

            System.out.println("h:" + ctx.h);
            System.out.println("h2:" + ctx.h2);

            ss_engine(ctx, buf, buf_len);

            /* our blocksize guess may have been way off - repeat if necessary */
            if (ctx.block_size > MIN_BLOCKSIZE && ctx.j < SPAMSUM_LENGTH / 2)
                ctx.block_size = ctx.block_size / 2;
            else
                done = true;

            System.out.println("h:" + ctx.h);
            System.out.println("h2:" + ctx.h2);
            System.out.println("h3:" + ctx.h3);
            System.out.println("bs:" + ctx.block_size);
            System.out.println("ret:" + new String(ctx.ret));
            System.out.println("p:" + new String(ctx.p));
            System.out.println("ret2:" + new String(ctx.ret2));

            if (ctx.h != 0) {
                ctx.p[ctx.j] = b64[(int) ((ctx.h2 & 0xFFFF) % 64)];
                ctx.ret2[ctx.k] = b64[(int) ((ctx.h3 & 0xFFFF) % 64)];
            }
            // strcat(ctx.p+ctx.j, ":");
            // strcat(ctx.p+ctx.j, ctx.ret2);
        }

        // assemble the final signature as "block_size:hash:hash2"; trim()
        // drops the unused NUL padding in the fixed-size char arrays
        ctx.ret = (ctx.block_size + ":" + new String(ctx.p).trim() + ":"
                + new String(ctx.ret2).trim()).toCharArray();

        // strncpy(result,ctx.ret,FUZZY_MAX_RESULT);
        System.out.println("bs:" + ctx.block_size);
        System.out.println("ret:" + new String(ctx.ret));
        System.out.println("p:" + new String(ctx.p));
        System.out.println("ret2:" + new String(ctx.ret2));
        System.out.println("h3:" + ctx.h3);

        // copy the signature into the caller's buffer (plain assignment to
        // the parameter would not be visible to the caller)
        if (result != null) {
            System.arraycopy(ctx.ret, 0, result, 0, Math.min(ctx.ret.length, result.length));
        }

        ss_destroy(ctx);
        // free(ctx);
        return false;
    }

    /* we only accept a match if we have at least one common substring in
       the signature of length ROLLING_WINDOW. This dramatically drops the
       false positive rate for low score thresholds while having negligible
       effect on the rate of spam detection.

       return 1 if the two strings do have a common substring, 0 otherwise */
    static int has_common_substring(char[] s1, char[] s2) {
        int i, j;
        int num_hashes;
        long[] hashes = new long[SPAMSUM_LENGTH];

        /* there are many possible algorithms for common substring
           detection. In this case I am re-using the rolling hash code to
           act as a filter for possible substring matches */

        roll_reset();
        // memset(hashes, 0, sizeof(hashes));

        /* first compute the windowed rolling hash at each offset in
           the first string */
        for (i = 0; i < s1.length; i++) {
            hashes[i] = roll_hash((char) s1[i]);
        }
        num_hashes = i;

        roll_reset();

        /* now for each offset in the second string compute the rolling
           hash and compare it to all of the rolling hashes for the first
           string. If one matches then we have a candidate substring match.
           We then confirm that match with a direct string comparison */
        for (i = 0; i < s2.length; i++) {
            long h = roll_hash((char) s2[i]);
            if (i < ROLLING_WINDOW - 1)
                continue;
            for (j = ROLLING_WINDOW - 1; j < num_hashes; j++) {
                if (hashes[j] != 0 && hashes[j] == h) {
                    /* we have a potential match - confirm it with a direct
                       comparison of the ROLLING_WINDOW characters involved,
                       as the C original does:
                       strncmp(s2+i-(ROLLING_WINDOW-1),
                               s1+j-(ROLLING_WINDOW-1), ROLLING_WINDOW) == 0 */
                    String w1 = new String(s1, j - (ROLLING_WINDOW - 1), ROLLING_WINDOW);
                    String w2 = new String(s2, i - (ROLLING_WINDOW - 1), ROLLING_WINDOW);
                    if (w1.equals(w2)) {
                        return 1;
                    }
                }
            }
        }
        return 0;
    }
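
    // ------------------------------------------------------------------
    // Illustration added for this tutorial (not part of the original
    // source): the common-substring filter above only lets two signatures
    // through when they share at least ROLLING_WINDOW (7) consecutive
    // characters. The sample strings below are arbitrary.
    // ------------------------------------------------------------------
    static void commonSubstringDemo() {
        char[] x = "ABCDEFGHIJKL".toCharArray();
        char[] y = "zzzDEFGHIJKyy".toCharArray(); // shares "DEFGHIJK" (8 chars) with x
        char[] z = "zzzDEFGHIyyyy".toCharArray(); // longest run shared with x is only 6 chars

        System.out.println(has_common_substring(x, y)); // prints 1
        System.out.println(has_common_substring(x, z)); // prints 0
    }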
    // eliminate sequences of longer than 3 identical characters. These
    // sequences contain very little information so they tend to just bias
    // the result unfairly
    static char[] eliminate_sequences(String string) {
        char[] str = string.toCharArray();
        StringBuffer ret = new StringBuffer();

        // keep the first three characters (the C original copies the whole
        // string and only rewrites it from index 3 onwards)
        ret.append(str, 0, Math.min(3, str.length));

        // Do not include repeats:
        for (int i = 3; i < str.length; i++) {
            if (str[i] != str[i - 1]
                    || str[i] != str[i - 2]
                    || str[i] != str[i - 3]) {
                ret.append(str[i]);
            }
        }
        return ret.toString().toCharArray();
    }

    /* this is the low level string scoring algorithm. It takes two strings
       and scores them on a scale of 0-100 where 0 is a terrible match and
       100 is a great match. The block_size is used to cope with very small
       messages. */
    static int score_strings(char[] s1, char[] s2, int block_size) {
        int score = 0;
        int len1, len2;

        len1 = s1.length;
        len2 = s2.length;

        if (len1 > SPAMSUM_LENGTH || len2 > SPAMSUM_LENGTH) {
            /* not a real spamsum signature? */
            return 0;
        }

        /* the two strings must have a common substring of length
           ROLLING_WINDOW to be candidates */
        if (has_common_substring(s1, s2) == 0) {
            return 0;
        }

        /* compute the edit distance between the two strings. The edit
           distance gives us a pretty good idea of how closely related the
           two strings are */
        score = StringUtils.getLevenshteinDistance(new String(s1), new String(s2));

        /* scale the edit distance by the lengths of the two strings. This
           changes the score to be a measure of the proportion of the message
           that has changed rather than an absolute quantity. It also copes
           with the variability of the string lengths. */
        score = (score * SPAMSUM_LENGTH) / (len1 + len2);

        /* at this stage the score occurs roughly on a 0-64 scale, with 0
           being a good match and 64 being a complete mismatch */

        /* rescale to a 0-100 scale (friendlier to humans) */
        score = (100 * score) / 64;

        /* it is possible to get a score above 100 here, but it is a
           really terrible match */
        if (score >= 100)
            return 0;

        /* now re-scale on a 0-100 scale with 0 being a poor match and 100
           being an excellent match. */
        score = 100 - score;

        // printf ("len1: %"PRIu32" len2: %"PRIu32"\n", len1, len2);

        /* when the blocksize is small we don't want to exaggerate the match size */
        if (score > block_size / MIN_BLOCKSIZE * Math.min(len1, len2)) {
            score = block_size / MIN_BLOCKSIZE * Math.min(len1, len2);
        }
        return score;
    }
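
    // Worked example added for this tutorial (not in the original source):
    // how score_strings() turns an edit distance into a 0-100 score. For two
    // 60-character signature parts with edit distance 10 and block_size 6
    // (integer arithmetic throughout):
    //   (10 * 64) / (60 + 60) = 5     scale by the combined signature length
    //   (100 * 5) / 64        = 7     move onto a 0-100 scale
    //   100 - 7               = 93    invert so that 100 means "very similar"
    //   cap: (6 / 3) * 60     = 120   93 < 120, so the score stays 93
    // At the minimum block size of 3 the cap would be (3 / 3) * 60 = 60 and
    // the reported score would be clipped to 60.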
    /* given two spamsum strings return a value indicating the degree
       to which they match. */
    int fuzzy_compare(FuzzyHash fh1, FuzzyHash fh2) {
        int score = 0;
        char[] s1_1, s1_2;
        char[] s2_1, s2_2;

        // if the blocksizes don't match then we are comparing
        // apples to oranges. This isn't an 'error' per se. We could
        // have two valid signatures, but they can't be compared.
        if (fh1.blocksize != fh2.blocksize
                && fh1.blocksize != fh2.blocksize * 2
                && fh2.blocksize != fh1.blocksize * 2) {
            return 0;
        }

        // there is very little information content in sequences of
        // the same character like 'LLLLL'. Eliminate any sequences
        // longer than 3. This is especially important when combined
        // with the has_common_substring() test below.
        // (The C original offsets the hash by one to skip the ':'
        // separator; the FuzzyHash fields already hold just the hash part,
        // so they are passed through unchanged here.)
        s1_1 = eliminate_sequences(fh1.hash);
        s2_1 = eliminate_sequences(fh2.hash);
        s1_2 = eliminate_sequences(fh1.hash2);
        s2_2 = eliminate_sequences(fh2.hash2);

        // each signature has a string for two block sizes. We now
        // choose how to combine the two block sizes. We checked above
        // that they have at least one block size in common
        if (fh1.blocksize == fh2.blocksize) {
            int score1, score2;
            score1 = score_strings(s1_1, s2_1, fh1.blocksize);
            score2 = score_strings(s1_2, s2_2, fh2.blocksize);
            // s.block_size = fh1.blocksize;
            score = Math.max(score1, score2);
        } else if (fh1.blocksize == fh2.blocksize * 2) {
            score = score_strings(s1_1, s2_2, fh1.blocksize);
            // s.block_size = fh1.blocksize;
        } else {
            score = score_strings(s1_2, s2_1, fh2.blocksize);
            // s.block_size = fh2.blocksize;
        }

        return (int) score;
    }

    /**
     * Main method for quick testing.
     *
     * @param args
     * @throws IOException
     */
    public static void main(String[] args) throws IOException {
        SSDeep ssd = new SSDeep();
        byte[] b2 = "Hello World how are you today...\n".getBytes();
        byte[] b3 = "Helli".getBytes();
        char[] h1 = new char[FUZZY_MAX_RESULT];
        boolean t1 = ssd.fuzzy_hash_buf(b2, b2.length, h1);
        System.out.println("Got " + new String(h1).trim());
        ssd.fuzzy_hash_file(new File("test"));
        //ssd.fuzzy_hash_file(new File("pom.xml"));
    }
}
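
// ---------------------------------------------------------------------------
// Usage sketch added for this tutorial; it is not part of the original
// SSDeep.java. The class above prints its signatures but never populates
// FuzzyHash from the textual "block_size:hash:hash2" form, so this companion
// class shows one possible way to do that and then score two signatures with
// fuzzy_compare(). The class name, the parse() helper and the sample inputs
// are all choices made for this example.
// ---------------------------------------------------------------------------
class SSDeepCompareDemo {

    /** Split a "block_size:hash:hash2" signature into a FuzzyHash. */
    static SSDeep.FuzzyHash parse(SSDeep ssd, String signature) {
        String[] parts = signature.split(":", 3);
        SSDeep.FuzzyHash fh = ssd.new FuzzyHash();
        fh.blocksize = Integer.parseInt(parts[0]);
        fh.hash = parts[1];
        fh.hash2 = parts.length > 2 ? parts[2] : "";
        return fh;
    }

    public static void main(String[] args) {
        SSDeep ssd = new SSDeep();

        byte[] a = "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.\n".getBytes();
        byte[] b = "The quick brown fox jumps over the lazy cat. The quick brown fox jumps over the lazy dog.\n".getBytes();

        char[] sigA = new char[SSDeep.FUZZY_MAX_RESULT];
        char[] sigB = new char[SSDeep.FUZZY_MAX_RESULT];
        ssd.fuzzy_hash_buf(a, a.length, sigA);
        ssd.fuzzy_hash_buf(b, b.length, sigB);

        SSDeep.FuzzyHash fa = parse(ssd, new String(sigA).trim());
        SSDeep.FuzzyHash fb = parse(ssd, new String(sigB).trim());

        // similar inputs should score high, unrelated inputs low
        System.out.println("fuzzy_compare score: " + ssd.fuzzy_compare(fa, fb));
    }
}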