Java tutorial
/*
 * SPDXVersion: SPDX-1.1
 * Creator: Person: Nuno Brito (nuno.brito@triplecheck.de)
 * Creator: Organization: TripleCheck (contact@triplecheck.de)
 * Created: 2014-08-05T12:59:50Z
 * LicenseName: EUPL-1.1-without-appendix
 * FileName: TokenizerJava_old.java
 * FileType: SOURCE
 * FileCopyrightText: <text> Copyright 2014 Nuno Brito, TripleCheck </text>
 * FileComment: <text> Use this class for generating an output string that we
 * later use for comparing similarities between two Java source code files </text>
 */

package deprecate.compare;

import java.io.File;
import java.util.ArrayList;
import java.util.StringTokenizer;

import org.apache.commons.lang3.StringUtils;

import structure.GrabJavaMethodsUsingRawInterpretation;
import structure.InterfaceToGrabJavaMethods;
import structure.SourceCodeSnippet;
import deprecate.JavaTokens.JavaMethod_old;
import deprecate.TokenResult;
import deprecate.TokenResults;
import deprecate.TokenSource;
import utils.files;
import utils.regex;
import utils.text;

/**
 *
 * @author Nuno Brito, 5th of August 2014 in Darmstadt, Germany
 */
public class TokenizerJava_old {

    // Settings to adjust the algorithm
    // minimum size for considering a token comparison as reliable
    private static final int tokenMinSize = 60, tokenMinPercentage = 94;

    // where we store our tokens
    ArrayList<String[]> tokens = new ArrayList<>();

    static ArrayList<TokenResults> analysisOutput = new ArrayList<>();

    // constructor class
    public TokenizerJava_old() {
        // initialize our java tokens file
        final File tokenFile = new File("tokens-java.txt");
        if (tokenFile.exists() == false) {
            System.out.println("CJC63 - Error, file not found: "
                    + tokenFile.getAbsolutePath());
            System.exit(-1);
        }
        // read each line and parse accordingly
        final String fileContent = files.readAsString(tokenFile);
        // create an array with the lines
        final String[] lines = fileContent.split("\n");
        // iterate each line
        for (final String line : lines) {
            // avoid comment lines
            if (line.startsWith("//")) {
                continue;
            }
            // break up the line into an array
            final String[] token = line.split(" ");
            // if the array specifies a token use it, otherwise keep it unchanged
            if (token.length > 1) {
                // add the new token to be replaced
                tokens.add(new String[]{token[0], token[1]});
            }
        }
    }

    /**
     * Given a source code, this method will split the code into methods and
     * will call the processLine method for each line
     * @param sourceCode
     * @return a TokenSource object, or null when no methods were found
     */
    TokenSource tokenize(final String sourceCode) {
        // the variable to store the output
        TokenSource tokenSource = new TokenSource();
        // get the array of methods
        InterfaceToGrabJavaMethods grabber = new GrabJavaMethodsUsingRawInterpretation();
        ArrayList<SourceCodeSnippet> methods = grabber.grab(sourceCode);
        // test if we got a meaningful result
        if (methods.isEmpty()) {
            // uncomment the lines below for debugging purposes
            // System.out.println(sourceCode);
            // System.out.println("---------------------");
            return null;
        }
        // keep the stored methods in memory
        tokenSource.setMethodsOriginal(methods);
        // now process each method
        for (final SourceCodeSnippet method : methods) {
            // the variable where we store the results
            String result = "";
            // the lines for this method
            String[] lines = method.getText().split("\n");
            // now iterate each line
            for (final String line : lines) {
                // get the source code line converted to tokens
                final String tokenizedLine = processLine(line);
                // if the result was null, continue to the next line
                if (tokenizedLine == null) {
                    continue;
                }
                // get the lines together
                result = result.concat(tokenizedLine);
            }
            // avoid empty results
            if (result.isEmpty()) {
                continue;
            }
            // remove the empty spaces
            result = result.replaceAll(" ", "");
            // create the tokenized method
            SourceCodeSnippet tokenMethod = new SourceCodeSnippet();
            // get the line where the method starts
            tokenMethod.setLineStart(method.getLineStart());
            // get the line where the method ends
            tokenMethod.setLineEnd(method.getLineEnd());
            // add the tokenized method to the object
            tokenMethod.setText(result);
            // add the method to the source code
            tokenSource.add(tokenMethod);
        }
        // all done
        return tokenSource;
    }

    /**
     * Compute the similarity between two strings and provide a percentage
     * @param s0
     * @param s1
     * @return
     */
    public static int percentSimilar(String s0, String s1) {
        int percentage = (int) (100 - (float) StringUtils.getLevenshteinDistance(s0, s1)
                * 100 / (float) (s0.length() + s1.length()));
        return percentage;
    }

    /**
     * Process a line of source code and return the tokenized result
     * @param line The line to be processed
     * @return A token string or null if not possible to process
     */
    private String processLine(String line) {
        // remove the empty lines
        final String trimmedLine = line.trim();
        if (trimmedLine.isEmpty()) {
            return null;
        }
        // part of a comment, no need to continue
        if (trimmedLine.startsWith("*") || trimmedLine.startsWith("/")) {
            return null;
        }
        // remove comments mixed with code on the same line
        int commentPosition = line.indexOf("//");
        if (commentPosition > -1) {
            line = line.substring(0, commentPosition);
        }
        // remove the leading and trailing white spaces
        // we add a leading space to solve issues identifying keywords on the 0 position
        line = " " + text.removeLeadingAndTrailingSpaces(line);
        // replace all the text within quotes with a keyword
        line = regex.replaceQuotesWithKeyword(line);
        // replace the variables with a defined keyword
        line = regex.replaceVariablesWithKeyword(line);
        // replace all the methods with a keyword
        line = regex.replaceMethodsWithKeyword(line);
        // replace the known tokens
        for (String[] token : tokens) {
            line = regex.replaceWithKeyword(token[0], "" + token[1], line);
        }
        // remove the white spaces
        line = regex.removeWhiteSpaces(line);
        // convert the token separator
        line = line.replaceAll("", " ");
        // all done
        return line;
    }

    /**
     * Compares a tokenized source code file against a tokenized source code
     * file in our archive.
     * @param source1 The source code that is fresh
     * @param sourceLibrary The source code in our library
     * @return The similarities between the tokenized files
     */
    public static TokenResults compare(final TokenSource source1,
            final TokenSource sourceLibrary) {
        // prepare the output
        TokenResults results = new TokenResults();
        int higherMatch = 0;
        // now do a double for to compare N x N methods
        for (SourceCodeSnippet methodTest : source1.getMethods()) {
            // compare the new methods
            for (SourceCodeSnippet methodLibrary : sourceLibrary.getMethods()) {
                if (methodLibrary.getTokens().length() < tokenMinSize) {
                    continue;
                }
                // get the similarity value
                int similarity = percentSimilar(methodTest.getTokens(),
                        methodLibrary.getTokens());
                // set the bar on what is accepted as minimal percentage matching
                if (similarity < tokenMinPercentage) {
                    continue;
                }
                // if (similarity > higherMatch) {
                //     higherMatch = similarity;
                TokenResult result = new TokenResult();
                result.setMethodInTest(methodTest);
                result.setMethodArchived(methodLibrary);
                result.setSimilarity(similarity);
                results.add(result);
            }
        }
        // all done
        return results;
    }

    /**
     * Used internally for the case when we want to show a line comparing two
     * source code files.
     * @param tokenSource The tokenized source
     * @return Empty if no file was specified
     */
    public static String setFileOutput(TokenSource tokenSource) {
        String output = "";
        if (tokenSource.getFile() != null) {
            output = utils.text.shortText(tokenSource.getFile().getAbsolutePath(), 40)
                    + ": ";
        }
        return output;
    }

    /**
     * Convert a source code file to tokens
     * @param file on disk
     * @return the tokens
     */
    public static TokenSource convert(final File file) {
        final String sourceCode = files.readAsString(file);
        TokenizerJava_old tokenJava = new TokenizerJava_old();
        TokenSource result = tokenJava.tokenize(sourceCode);
        // why was the line below disabled?
        result.setFile(file);
        return result;
    }

    /**
     * Convert the tokens stored on disk onto a token source object
     * @param tokenCode line from disk
     * @return the tokenSource object or null if some problem occurred
     */
    public static TokenSource decompress(final String tokenCode) {
        TokenizerJava_old tokens = new TokenizerJava_old();
        TokenSource result = new TokenSource();
        // split each method according to tabs
        //String[] methodTexts = tokenCode.split("\t");
        StringTokenizer stringTokenizer = new StringTokenizer(tokenCode,
                TokenSource.separatorMethod);
        // iterate and create the token methods
        //for (String methodText : methodTexts) {
        while (stringTokenizer.hasMoreTokens()) {
            // create the token method
            SourceCodeSnippet method = new SourceCodeSnippet();
            final String line = stringTokenizer.nextElement().toString();
            int i1 = line.indexOf(TokenSource.separatorData);
            // feed the token data
            method.setTokens(line.substring(i1 + 1));
            // split the lines data in two lines
            final String lineData = line.substring(0, i1);
            final String[] lines = lineData.split("\\.\\.");
            // set the method lines
            method.setLineStart(Integer.parseInt(lines[0]));
            method.setLineEnd(Integer.parseInt(lines[1]));
            // add it to the result
            result.add(method);
        }
        // all done
        return result;
    }

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) {
        /*
         P ST VD M( V[] V){ IF( V> V){ V M( V); R;} M V= N M(); V M( TE); V= V;}
         -------------------
         ST VD M(){ M V= N M( V M(),##); FL S V= V M( V); M V= N M(); V M( V);}
         -------------------
         ST VD M(){ M V= N M(##); S V= V; V M( V, V M());}
         */
        findMatches(new File("files-minor.java"), new File("test"));
    }

    /**
     * When given a file with source code and a folder containing other source
     * code files, this method will try to find the best possible matches.
     * @param file The file to find matches
     * @param folder The folder where possible matches are located
     */
    public static void findMatches(final File file, final File folder) {
        // preflight checks
        if (folder.exists() == false || folder.isDirectory() == false) {
            System.out.println("TJ404 - Error, folder does not exist: "
                    + folder.getAbsolutePath());
            return;
        }
        if (file.exists() == false || file.isFile() == false) {
            System.out.println("TJ410 - Error, file does not exist: "
                    + file.getAbsolutePath());
            return;
        }
        // now convert the source code file we want to compare
        TokenSource source1 = convert(file);
        // clear the results from a previous search
        analysisOutput.clear();
        // process the files that we find on the target folder
        processFiles(source1, folder, 25);
        // if we have something to show, show it
        if (analysisOutput.isEmpty() == false) {
            for (TokenResults results : analysisOutput) {
                for (TokenResult result : results.getResultList()) {
                    System.out.println("" + result.getSimilarity() + "% \t"
                            + result.getMethodArchived().getTokens());
                }
            }
        }
    }

    /**
     * Crawl a given folder looking for matches to a given source code file.
     * @param source1 The source code file to compare against
     * @param folder The folder where other source code files are located
     * @param maxCrawl The maximum permitted level of subfolders to be crawled
     */
    private static void processFiles(final TokenSource source1, final File folder,
            final int maxCrawl) {
        // preflight check
        if (maxCrawl == 0) {
            return;
        }
        // get a list of files
        File[] things = folder.listFiles();
        // iterate each result
        for (File thing : things) {
            // are we looking at a directory?
            if (thing.isDirectory()) {
                // loop the processing inside the next directory
                processFiles(source1, thing, maxCrawl - 1);
                continue;
            }
            // get the results from comparing both source codes
            TokenResults output = processFile(source1, thing);
            // if null, no valid result was found
            if (output == null) {
                continue;
            }
            // add the result to the analysis output
            analysisOutput.add(output);
        }
    }

    /**
     * Process a specific file
     * @param source1 The source against which we want to compare
     * @param file A file on disk
     */
    private static TokenResults processFile(final TokenSource source1, final File file) {
        // convert the target file to tokens
        TokenSource source = convert(file);
        // instantiate the class to proceed with comparisons
        TokenResults result = TokenizerJava_old.compare(source1, source);
        // no point in continuing if the result was null
        if (result.isEmpty()) {
            return null;
        }
        // set the file on our archive that was compared
        result.setTestFileReference(file.getAbsolutePath());
        // all done
        return result;
    }

}
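Besides the folder crawl in findMatches, the class can be driven directly: convert each file to a TokenSource and pass the pair to compare. The sketch below is a minimal, hypothetical example that only uses methods visible in the class above; it assumes the project's deprecate.* and structure.* packages are on the classpath, that a tokens-java.txt mapping file exists in the working directory (the constructor calls System.exit(-1) without it), and that both input files contain at least one detectable method (tokenize returns null otherwise). The class name CompareTwoFiles and the file names A.java and B.java are placeholders.

import java.io.File;

import deprecate.TokenResult;
import deprecate.TokenResults;
import deprecate.TokenSource;
import deprecate.compare.TokenizerJava_old;

public class CompareTwoFiles {

    public static void main(String[] args) {
        // convert both source files to their token representation,
        // one tokenized string per detected method
        TokenSource fresh = TokenizerJava_old.convert(new File("A.java"));
        TokenSource archived = TokenizerJava_old.convert(new File("B.java"));

        // compare the fresh source against the archived one; only method pairs
        // above tokenMinSize and tokenMinPercentage end up in the result list
        TokenResults matches = TokenizerJava_old.compare(fresh, archived);

        // print one line per matching method pair
        for (TokenResult match : matches.getResultList()) {
            System.out.println(match.getSimilarity() + "% similar, archived method at lines "
                    + match.getMethodArchived().getLineStart() + ".."
                    + match.getMethodArchived().getLineEnd());
        }
    }
}

Note that the thresholds (tokenMinSize = 60, tokenMinPercentage = 94) are hard-coded constants in TokenizerJava_old, so tuning the sensitivity of this sketch means editing the class itself.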