Java tutorial
/*
 * SPDXVersion: SPDX-1.1
 * Creator: Person: Nuno Brito (nuno.brito@triplecheck.de)
 * Creator: Organization: TripleCheck (contact@triplecheck.de)
 * Created: 2014-08-05T12:59:50Z
 * LicenseName: EUPL-1.1-without-appendix
 * FileName: TokenizerJava_old.java
 * FileType: SOURCE
 * FileCopyrightText: <text> Copyright 2014 Nuno Brito, TripleCheck </text>
 * FileComment: <text> Use this class for generating an output string that we
 * later use for comparing similarities between two Java source code files </text>
 */

package deprecate.compare;

import java.io.File;
import java.util.ArrayList;
import java.util.StringTokenizer;

import org.apache.commons.lang3.StringUtils;

import structure.GrabJavaMethodsUsingRawInterpretation;
import structure.InterfaceToGrabJavaMethods;
import structure.SourceCodeSnippet;
import deprecate.JavaTokens.JavaMethod_old;
import deprecate.TokenResult;
import deprecate.TokenResults;
import deprecate.TokenSource;
import utils.files;
import utils.regex;
import utils.text;

/**
 *
 * @author Nuno Brito, 5th of August 2014 in Darmstadt, Germany
 */
public class TokenizerJava_old {

    // Settings to adjust the algorithm
    // minimum size for considering a token comparison as reliable
    private static final int tokenMinSize = 60, tokenMinPercentage = 94;

    // where we store our tokens
    ArrayList<String[]> tokens = new ArrayList<>();

    static ArrayList<TokenResults> analysisOutput = new ArrayList<>();

    // constructor class
    public TokenizerJava_old() {
        // initialize our java tokens file
        final File tokenFile = new File("tokens-java.txt");
        if (tokenFile.exists() == false) {
            System.out.println("CJC63 - Error, file not found: "
                    + tokenFile.getAbsolutePath());
            System.exit(-1);
        }
        // read each line and parse accordingly
        final String fileContent = files.readAsString(tokenFile);
        // create an array with the lines
        final String[] lines = fileContent.split("\n");
        // iterate each line
        for (final String line : lines) {
            // avoid comment lines
            if (line.startsWith("//")) {
                continue;
            }
            // break up the line into an array
            final String[] token = line.split(" ");
            // if the array specifies a token use it, otherwise keep it unchanged
            if (token.length > 1) {
                // add the new token to be replaced
                tokens.add(new String[]{token[0], token[1]});
            }
        }
    }

    /**
     * Given a source code, this method will split the code into methods and
     * will call the processLine method for each line
     * @param sourceCode
     * @return a TokenSource object, or null when no methods were found
     */
    TokenSource tokenize(final String sourceCode) {
        // the variable to store the output
        TokenSource tokenSource = new TokenSource();
        // get the array of methods
        InterfaceToGrabJavaMethods grabber = new GrabJavaMethodsUsingRawInterpretation();
        ArrayList<SourceCodeSnippet> methods = grabber.grab(sourceCode);
        // test if we got a meaningful result
        if (methods.isEmpty()) {
            // uncomment the lines below for debugging purposes
            // System.out.println(sourceCode);
            // System.out.println("---------------------");
            return null;
        }
        // keep the stored methods in memory
        tokenSource.setMethodsOriginal(methods);
        // now process each method
        for (final SourceCodeSnippet method : methods) {
            // the variable where we store the results
            String result = "";
            // the lines for this method
            String[] lines = method.getText().split("\n");
            // now iterate each line
            for (final String line : lines) {
                // get the source code line converted to tokens
                final String tokenizedLine = processLine(line);
                // if the result was null, continue to the next line
                if (tokenizedLine == null) {
                    continue;
                }
                // get the lines together
                result = result.concat(tokenizedLine);
            }
            // avoid empty results
            if (result.isEmpty()) {
                continue;
            }
            // remove the empty spaces
            result = result.replaceAll(" ", "");
            // create the tokenized method
            SourceCodeSnippet tokenMethod = new SourceCodeSnippet();
            // get the line where the method starts
            tokenMethod.setLineStart(method.getLineStart());
            // get the line where the method ends
            tokenMethod.setLineEnd(method.getLineEnd());
            // add the tokenized method to the object
            tokenMethod.setText(result);
            // add the method to the source code
            tokenSource.add(tokenMethod);
        }
        // all done
        return tokenSource;
    }

    /**
     * Compute the similarity between two strings and provide a percentage
     * @param s0
     * @param s1
     * @return
     */
    public static int percentSimilar(String s0, String s1) {
        int percentage = (int) (100 - (float) StringUtils.getLevenshteinDistance(s0, s1)
                * 100 / (float) (s0.length() + s1.length()));
        return percentage;
    }

    /**
     * Process a line of source code and return the tokenized result
     * @param line The line to be processed
     * @return A token string or null if not possible to process
     */
    private String processLine(String line) {
        // remove the empty lines
        final String trimmedLine = line.trim();
        if (trimmedLine.isEmpty()) {
            return null;
        }
        // part of a comment, no need to continue
        if (trimmedLine.startsWith("*") || trimmedLine.startsWith("/")) {
            return null;
        }
        // remove comments mixed with code on the same line
        int commentPosition = line.indexOf("//");
        if (commentPosition > -1) {
            line = line.substring(0, commentPosition);
        }
        // remove the leading and trailing white spaces
        // we add a leading space to solve issues identifying keywords on the 0 position
        line = " " + text.removeLeadingAndTrailingSpaces(line);
        // replace all the text within quotes with a keyword
        line = regex.replaceQuotesWithKeyword(line);
        // replace the variables with a defined keyword
        line = regex.replaceVariablesWithKeyword(line);
        // replace all the methods with a keyword
        line = regex.replaceMethodsWithKeyword(line);
        // replace the known tokens
        for (String[] token : tokens) {
            line = regex.replaceWithKeyword(token[0], "" + token[1], line);
        }
        // remove the white spaces
        line = regex.removeWhiteSpaces(line);
        // convert the token separator
        line = line.replaceAll("", " ");
        // all done
        return line;
    }

    /**
     * Compares a tokenized source code file against a tokenized source code
     * file in our archive.
     * @param source1 The source code that is fresh
     * @param sourceLibrary The source code in our library
     * @return The similarities between the tokenized files
     */
    public static TokenResults compare(final TokenSource source1,
            final TokenSource sourceLibrary) {
        // prepare the output
        TokenResults results = new TokenResults();
        int higherMatch = 0;
        // now do a double for to compare N x N methods
        for (SourceCodeSnippet methodTest : source1.getMethods()) {
            // compare the new methods
            for (SourceCodeSnippet methodLibrary : sourceLibrary.getMethods()) {
                if (methodLibrary.getTokens().length() < tokenMinSize) {
                    continue;
                }
                // get the similarity value
                int similarity = percentSimilar(methodTest.getTokens(),
                        methodLibrary.getTokens());
                // set the bar on what is accepted as minimal percentage matching
                if (similarity < tokenMinPercentage) {
                    continue;
                }
                // if (similarity > higherMatch) {
                //     higherMatch = similarity;
                TokenResult result = new TokenResult();
                result.setMethodInTest(methodTest);
                result.setMethodArchived(methodLibrary);
                result.setSimilarity(similarity);
                results.add(result);
            }
        }
        // all done
        return results;
    }

    /**
     * Used internally for the case when we want to show a line comparing two
     * source code files.
     * @param tokenSource The tokenized source
     * @return Empty if no file was specified
     */
    public static String setFileOutput(TokenSource tokenSource) {
        String output = "";
        if (tokenSource.getFile() != null) {
            output = utils.text.shortText(tokenSource.getFile().getAbsolutePath(), 40)
                    + ": ";
        }
        return output;
    }

    /**
     * Convert a source code file to tokens
     * @param file on disk
     * @return the tokens
     */
    public static TokenSource convert(final File file) {
        final String sourceCode = files.readAsString(file);
        TokenizerJava_old tokenJava = new TokenizerJava_old();
        TokenSource result = tokenJava.tokenize(sourceCode);
        // why was the line below disabled?
        result.setFile(file);
        return result;
    }

    /**
     * Convert the tokens stored on disk onto a token source object
     * @param tokenCode line from disk
     * @return the tokenSource object or null if some problem occurred
     */
    public static TokenSource decompress(final String tokenCode) {
        TokenizerJava_old tokens = new TokenizerJava_old();
        TokenSource result = new TokenSource();
        // split each method according to tabs
        //String[] methodTexts = tokenCode.split("\t");
        StringTokenizer stringTokenizer = new StringTokenizer(tokenCode,
                TokenSource.separatorMethod);
        // iterate and create the token methods
        //for (String methodText : methodTexts) {
        while (stringTokenizer.hasMoreTokens()) {
            // create the token method
            SourceCodeSnippet method = new SourceCodeSnippet();
            final String line = stringTokenizer.nextElement().toString();
            int i1 = line.indexOf(TokenSource.separatorData);
            // feed the token data
            method.setTokens(line.substring(i1 + 1));
            // split the lines data in two lines
            final String lineData = line.substring(0, i1);
            final String[] lines = lineData.split("\\.\\.");
            // set the method lines
            method.setLineStart(Integer.parseInt(lines[0]));
            method.setLineEnd(Integer.parseInt(lines[1]));
            // add it to the result
            result.add(method);
        }
        // all done
        return result;
    }

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) {
        /*
         P ST VD M( V[] V){ IF( V> V){ V M( V); R;} M V= N M(); V M( TE); V= V;}
         -------------------
         ST VD M(){ M V= N M( V M(),##); FL S V= V M( V); M V= N M(); V M( V);}
         -------------------
         ST VD M(){ M V= N M(##); S V= V; V M( V, V M());}
         */
        findMatches(new File("files-minor.java"), new File("test"));
    }

    /**
     * When given a file with source code and a folder containing other source
     * code files, this method will try to find the best possible matches.
     * @param file The file to find matches
     * @param folder The folder where possible matches are located
     */
    public static void findMatches(final File file, final File folder) {
        // preflight checks
        if (folder.exists() == false || folder.isDirectory() == false) {
            System.out.println("TJ404 - Error, folder does not exist: "
                    + folder.getAbsolutePath());
            return;
        }
        if (file.exists() == false || file.isFile() == false) {
            System.out.println("TJ410 - Error, file does not exist: "
                    + file.getAbsolutePath());
            return;
        }
        // now convert the source code file we want to compare
        TokenSource source1 = convert(file);
        // clear the results from a previous search
        analysisOutput.clear();
        // process the files that we find on the target folder
        processFiles(source1, folder, 25);
        // if we have something to show, show it
        if (analysisOutput.isEmpty() == false) {
            for (TokenResults results : analysisOutput) {
                for (TokenResult result : results.getResultList()) {
                    System.out.println("" + result.getSimilarity() + "% \t"
                            + result.getMethodArchived().getTokens());
                }
            }
        }
    }

    /**
     * Crawl a given folder looking for matches to a given source code file.
     * @param source1 The source code file to compare against
     * @param folder The folder where other source code files are located
     * @param maxCrawl The maximum permitted level of subfolders to be crawled
     */
    private static void processFiles(final TokenSource source1, final File folder,
            final int maxCrawl) {
        // preflight check
        if (maxCrawl == 0) {
            return;
        }
        // get a list of files
        File[] things = folder.listFiles();
        // iterate each result
        for (File thing : things) {
            // are we looking at a directory?
            if (thing.isDirectory()) {
                // loop the processing inside the next directory
                processFiles(source1, thing, maxCrawl - 1);
                continue;
            }
            // get the results from comparing both source codes
            TokenResults output = processFile(source1, thing);
            // if null, no valid result was found
            if (output == null) {
                continue;
            }
            // add the result to the analysis output
            analysisOutput.add(output);
        }
    }

    /**
     * Process a specific file
     * @param source1 The source against which we want to compare
     * @param file A file on disk
     */
    private static TokenResults processFile(final TokenSource source1, final File file) {
        // convert the target file to tokens
        TokenSource source = convert(file);
        // instantiate the class to proceed with comparisons
        TokenResults result = TokenizerJava_old.compare(source1, source);
        // no point in continuing if the result was null
        if (result.isEmpty()) {
            return null;
        }
        // set the file on our archive that was compared
        result.setTestFileReference(file.getAbsolutePath());
        // all done
        return result;
    }

}
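Besides the folder crawl in findMatches, the class can be driven directly: convert each file to a TokenSource and pass the pair to compare. The sketch below is a minimal, hypothetical example that only uses methods visible in the class above; it assumes the project's deprecate.* and structure.* packages are on the classpath, that a tokens-java.txt mapping file exists in the working directory (the constructor calls System.exit(-1) without it), and that both input files contain at least one detectable method (tokenize returns null otherwise). The class name CompareTwoFiles and the file names A.java and B.java are placeholders.

import java.io.File;

import deprecate.TokenResult;
import deprecate.TokenResults;
import deprecate.TokenSource;
import deprecate.compare.TokenizerJava_old;

public class CompareTwoFiles {

    public static void main(String[] args) {
        // convert both source files to their token representation,
        // one tokenized string per detected method
        TokenSource fresh = TokenizerJava_old.convert(new File("A.java"));
        TokenSource archived = TokenizerJava_old.convert(new File("B.java"));

        // compare the fresh source against the archived one; only method pairs
        // above tokenMinSize and tokenMinPercentage end up in the result list
        TokenResults matches = TokenizerJava_old.compare(fresh, archived);

        // print one line per matching method pair
        for (TokenResult match : matches.getResultList()) {
            System.out.println(match.getSimilarity() + "% similar, archived method at lines "
                    + match.getMethodArchived().getLineStart() + ".."
                    + match.getMethodArchived().getLineEnd());
        }
    }
}

Note that the thresholds (tokenMinSize = 60, tokenMinPercentage = 94) are hard-coded constants in TokenizerJava_old, so tuning the sensitivity of this sketch means editing the class itself.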