com.searchcode.app.util.SearchcodeLib.java Source code

Introduction

Here is the source code for com.searchcode.app.util.SearchcodeLib.java
Source

/*
 * Copyright (c) 2016 Boyter Online Services
 *
 * Use of this software is governed by the Fair Source License included
 * in the LICENSE.TXT file, but will be eventually open under GNU General Public License Version 3
 * see the README.md for when this clause will take effect
 *
 * Version 1.3.10
 */

package com.searchcode.app.util;

import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.Iterables;
import com.searchcode.app.config.Values;
import com.searchcode.app.dao.Data;
import com.searchcode.app.dto.*;
import com.searchcode.app.service.Singleton;
import com.searchcode.app.dto.FileClassifierResult;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.queryparser.classic.QueryParser;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.OptionalDouble;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class SearchcodeLib {

    // Maximum number of characters of a file's contents that the keyword and
    // spelling helpers in this class will look at; longer input is truncated.
    private int MAXSPLITLENGTH = 100000;
    // Matches a run of two or more consecutive uppercase ASCII letters.
    private Pattern MULTIPLEUPPERCASE = Pattern.compile("[A-Z]{2,}");
    // Threshold compared against the average line length in isMinified;
    // initialised from the configured default and optionally overridden by the Data constructor.
    private int MINIFIEDLENGTH = Integer.parseInt(Values.DEFAULTMINIFIEDLENGTH);
    // Comma separated extension list read from the BINARY_WHITE_LIST property.
    public String[] WHITELIST = Properties.getProperties()
            .getProperty(Values.BINARY_WHITE_LIST, Values.DEFAULT_BINARY_WHITE_LIST).split(",");
    // Comma separated extension list read from the BINARY_BLACK_LIST property.
    public String[] BLACKLIST = Properties.getProperties()
            .getProperty(Values.BINARY_BLACK_LIST, Values.DEFAULT_BINARY_BLACK_LIST).split(",");
    // When false, isBinary skips its character-counting heuristic.
    private boolean GUESSBINARY = Boolean
            .parseBoolean(Properties.getProperties().getProperty(Values.GUESS_BINARY, Values.DEFAULT_GUESS_BINARY));
    // Selects between formatQueryStringAndDefault (true) and formatQueryStringOrDefault (false).
    private boolean ANDMATCH = Boolean.parseBoolean(com.searchcode.app.util.Properties.getProperties()
            .getProperty(Values.AND_MATCH, Values.DEFAULT_AND_MATCH));
    // Source of the known-extension database consulted by isBinary; assigned in the constructors.
    public FileClassifier fileClassifier = null;

    /**
     * Creates an instance that uses the default minified-length threshold
     * and a freshly constructed file classifier.
     */
    public SearchcodeLib() {
        this.fileClassifier = new FileClassifier();
    }

    /**
     * Creates an instance whose minified-length threshold is looked up through
     * the supplied data source under the name {@code Values.MINIFIEDLENGTH}.
     * If the looked-up value is zero or negative the configured default is used instead.
     *
     * @param data data source queried for the minified length setting
     */
    public SearchcodeLib(Data data) {
        int configuredLength = Singleton.getHelpers().tryParseInt(
                data.getDataByName(Values.MINIFIEDLENGTH, Values.DEFAULTMINIFIEDLENGTH),
                Values.DEFAULTMINIFIEDLENGTH);

        this.MINIFIEDLENGTH = configuredLength > 0
                ? configuredLength
                : Integer.parseInt(Values.DEFAULTMINIFIEDLENGTH);

        this.fileClassifier = new FileClassifier();
    }

    /**
     * Builds additional index terms by breaking camel-cased words apart so that,
     * for example, RegexIndexer also becomes "Regex Indexer".
     *
     * Every character that is not [a-zA-Z0-9] is first turned into a space and the
     * text is split on single spaces. A word is expanded only when it is 7 or more
     * characters long, contains no run of two or more uppercase letters, and splitting
     * in front of each uppercase letter yields more than one piece.
     *
     * @param contents text to analyse, may be null
     * @return the space separated pieces (each group prefixed by a space), or the
     *         empty string when contents is null
     */
    public String splitKeywords(String contents) {
        if (contents == null) {
            return Values.EMPTYSTRING;
        }

        String cleaned = contents.replaceAll("[^a-zA-Z0-9]", " ");

        // Performance guard: only the first MAXSPLITLENGTH characters are examined.
        // The trailing "AAA" makes the possibly cut-off last word contain multiple
        // uppercase letters so it is never split.
        if (cleaned.length() > this.MAXSPLITLENGTH) {
            cleaned = cleaned.substring(0, MAXSPLITLENGTH) + "AAA";
        }

        StringBuilder extraTerms = new StringBuilder();

        for (String word : cleaned.split(" ")) {
            if (word.length() < 7 || MULTIPLEUPPERCASE.matcher(word).find()) {
                continue;
            }

            String[] pieces = word.split("(?=\\p{Upper})");
            if (pieces.length > 1) {
                extraTerms.append(" ").append(String.join(" ", pieces));
            }
        }

        return extraTerms.toString();
    }

    // Finds versions with words at the front, eg linux2.7.4: one or more lowercase
    // letters, up to two optional "digits." groups, then either '*' or digits.
    // Compiled once because Pattern instances are immutable and reusable.
    private static final Pattern WORD_WITH_VERSION = Pattern.compile("[a-z]+(\\d+\\.)?(\\d+\\.)?(\\*|\\d+)");

    /**
     * Extracts tokens that look like a lowercase word immediately followed by a
     * version number (for example linux2.7.4) so they can be indexed as extra terms.
     *
     * @param contents text to scan, may be null
     * @return every match prefixed by a single space and concatenated, or the empty
     *         string when contents is null
     */
    public String findInterestingKeywords(String contents) {
        if (contents == null) {
            return Values.EMPTYSTRING;
        }

        // Performance improvement hack: only scan the first MAXSPLITLENGTH characters.
        // "AAA" is appended after the cut exactly as the original implementation did.
        if (contents.length() > this.MAXSPLITLENGTH) {
            contents = contents.substring(0, MAXSPLITLENGTH) + "AAA";
        }

        StringBuilder indexContents = new StringBuilder();
        Matcher m = WORD_WITH_VERSION.matcher(contents);

        while (m.find()) {
            indexContents.append(" ").append(m.group());
        }

        return indexContents.toString();
    }

    /**
     * Strips every regex word character ({@code \w}) from the input and returns the
     * remaining characters, each one followed by a single space.
     *
     * @param contents text to process, may be null
     * @return the remaining characters separated by spaces, or the empty string when
     *         contents is null
     */
    public String findInterestingCharacters(String contents) {
        if (contents == null) {
            return Values.EMPTYSTRING;
        }

        String symbolsOnly = contents.replaceAll("\\w", "");
        StringBuilder spaced = new StringBuilder(symbolsOnly.length() * 2);

        for (int i = 0; i < symbolsOnly.length(); i++) {
            spaced.append(symbolsOnly.charAt(i)).append(' ');
        }

        return spaced.toString();
    }

    /**
     * Tells whether the given language is one of those for which the cost
     * should not be displayed.
     *
     * @param languagename exact (case sensitive) language name
     * @return true for Unknown, Text, JSON, Markdown, INI File, ReStructuredText
     *         and Configuration; false for anything else
     */
    public boolean languageCostIgnore(String languagename) {
        switch (languagename) {
        case "Unknown":
        case "Text":
        case "JSON":
        case "Markdown":
        case "INI File":
        case "ReStructuredText":
        case "Configuration":
            return true;
        default:
            return false;
        }
    }

    /**
     * Crude estimate of the number of real code lines: every line is trimmed and
     * then skipped when it is empty or starts with one of a fixed set of
     * comment-like prefixes. No parsing is done, which keeps this cheap.
     *
     * @param codeLines lines of the file
     * @return number of lines that are neither blank nor start with a known prefix
     */
    public int countFilteredLines(List<String> codeLines) {
        final String[] skippedPrefixes = { "//", "#", "<!--", "!*", "--", "%", ";", "*", "/*" };

        int counted = 0;

        for (String rawLine : codeLines) {
            String line = rawLine.trim();

            if (line.isEmpty()) {
                continue;
            }

            boolean skipped = false;
            for (String prefix : skippedPrefixes) {
                if (line.startsWith(prefix)) {
                    skipped = true;
                    break;
                }
            }

            if (!skipped) {
                counted++;
            }
        }

        return counted;
    }

    /**
     * Feeds the words found in the given text to the shared spelling corrector.
     * Only the first MAXSPLITLENGTH characters are considered, non-alphanumeric
     * characters act as separators, words are lower-cased, at most the first
     * 10000 tokens are used and tokens shorter than 3 characters are skipped.
     * TODO move this into the spelling corrector class itself
     *
     * @param contents text whose words should be learned; null is ignored
     */
    public void addToSpellingCorrector(String contents) {
        if (contents == null) {
            return;
        }

        ISpellingCorrector corrector = Singleton.getSpellingCorrector();

        // Limit to reduce performance impacts
        String limited = contents.length() > this.MAXSPLITLENGTH
                ? contents.substring(0, MAXSPLITLENGTH)
                : contents;

        String[] tokens = limited.replaceAll("[^a-zA-Z0-9]", " ").toLowerCase().split(" ");

        // Only the first 10000 to avoid causing too much slow-down
        int tokenLimit = Math.min(tokens.length, 10000);

        for (int i = 0; i < tokenLimit; i++) {
            String token = tokens[i];
            if (token.length() >= 3) {
                corrector.putWord(token);
            }
        }
    }

    /**
     * Decides whether a file, given as its list of lines, looks minified so that
     * it can be excluded from the index. Files whose lower-cased name ends with
     * "." plus a WHITELIST extension are never reported as minified. Otherwise the
     * file is minified when the average length of its lines (trimmed, with spaces
     * removed) is greater than MINIFIEDLENGTH.
     *
     * @param codeLines lines of the file
     * @param fileName  name of the file, used for the extension check
     * @return true when the file is suspected to be minified
     */
    public boolean isMinified(List<String> codeLines, String fileName) {
        final String lowerFileName = fileName.toLowerCase();

        boolean whitelisted = Arrays.stream(this.WHITELIST)
                .anyMatch(extension -> lowerFileName.endsWith("." + extension));
        if (whitelisted) {
            return false;
        }

        OptionalDouble averageLength = codeLines.stream()
                .mapToInt(line -> line.trim().replace(" ", "").length())
                .average();

        // An empty file has no average and is therefore never minified.
        return averageLength.isPresent() && averageLength.getAsDouble() > this.MINIFIEDLENGTH;
    }

    /**
     * Decides whether a file, given as its list of lines, should be treated as
     * binary so that it can be excluded from the index. Checks run in this order
     * and the first one that applies wins:
     * <ol>
     * <li>no lines at all: binary ("file is empty")</li>
     * <li>extension in the user WHITELIST: not binary</li>
     * <li>extension in the user BLACKLIST: binary</li>
     * <li>extension known to the file classifier database: not binary</li>
     * <li>guessing disabled (GUESSBINARY false): not binary</li>
     * <li>otherwise the characters of the first 10000 lines are counted; the file
     * is binary when all of them are non-ASCII or fewer than 30% are ASCII</li>
     * </ol>
     *
     * @param codeLines lines of the file
     * @param fileName  name of the file, used for the extension checks
     * @return finding holding the binary flag and a short reason (empty when there
     *         is nothing to report)
     */
    public BinaryFinding isBinary(List<String> codeLines, String fileName) {
        if (codeLines.isEmpty()) {
            return new BinaryFinding(true, "file is empty");
        }

        String lowerFileName = fileName.toLowerCase();

        // Check against user set whitelist
        for (String extension : this.WHITELIST) {
            if (lowerFileName.endsWith("." + extension)) {
                return new BinaryFinding(false, "appears in extension whitelist");
            }
        }

        // Check against user set blacklist
        for (String extension : this.BLACKLIST) {
            if (lowerFileName.endsWith("." + extension)) {
                return new BinaryFinding(true, "appears in extension blacklist");
            }
        }

        // Check if the extension is one the file classifier knows about
        for (FileClassifierResult fileClassifierResult : fileClassifier.getDatabase()) {
            for (String extension : fileClassifierResult.extensions) {
                if (lowerFileName.endsWith("." + extension)) {
                    return new BinaryFinding(false, "appears in internal extension whitelist");
                }
            }
        }

        // If we aren't meant to guess then assume it isn't binary
        if (!this.GUESSBINARY) {
            return new BinaryFinding(false, Values.EMPTYSTRING);
        }

        // Only sample the first 10000 lines to bound the cost on huge files
        int lines = Math.min(codeLines.size(), 10000);
        double asciiCount = 0;
        double nonAsciiCount = 0;

        for (int i = 0; i < lines; i++) {
            String line = codeLines.get(i);
            for (int j = 0; j < line.length(); j++) {
                // ASCII covers code points 0..127 only; 128 is already outside the range
                if (line.charAt(j) < 128) {
                    asciiCount++;
                } else {
                    nonAsciiCount++;
                }
            }
        }

        if (nonAsciiCount == 0) {
            return new BinaryFinding(false, Values.EMPTYSTRING);
        }

        if (asciiCount == 0) {
            return new BinaryFinding(true, "all characters found non-ascii");
        }

        // If fewer than 30% of characters are ascii then it is probably binary
        double asciiRatio = asciiCount / (asciiCount + nonAsciiCount);

        if (asciiRatio < 0.30) {
            return new BinaryFinding(true, "less than 30% of characters are ascii");
        }

        return new BinaryFinding(false, Values.EMPTYSTRING);
    }

    /**
     * Picks the owner of a piece of code by scoring each candidate with
     * {@code lines / age^1.8}, where age is measured from the current time (in
     * seconds since the epoch) back to the candidate's most recent commit and
     * divided by 60 twice. The highest positive score wins; "Unknown" is returned
     * when no candidate scores above zero.
     * NB if a commit is very close to the current time it will always win.
     *
     * @param codeOwners candidates to choose from
     * @return name of the best scoring candidate or "Unknown"
     */
    public String codeOwner(List<CodeOwner> codeOwners) {
        final long nowSeconds = System.currentTimeMillis() / 1000L;

        String winner = "Unknown";
        double topScore = 0;

        for (CodeOwner candidate : codeOwners) {
            // NOTE(review): if the timestamp getter returns an integral type these
            // divisions truncate to whole hours - confirm that is intended.
            double ageHours = (nowSeconds - candidate.getMostRecentUnixCommitTimestamp()) / 60 / 60;
            double score = candidate.getNoLines() / Math.pow(ageHours, 1.8);

            if (score > topScore) {
                topScore = score;
                winner = candidate.getName();
            }
        }

        return winner;
    }

    /**
     * Cleans and formats code into something that can be indexed by lucene while
     * supporting searches such as i++ matching for(int i=0;i<100;i++;){
     *
     * The text goes through six cumulative stages. Each stage replaces a further
     * group of punctuation characters with spaces and the text as it stands after
     * every stage is appended, preceded by a space, to the result. The output thus
     * contains six progressively more stripped copies of the input.
     *
     * @param contents code to clean, may be null
     * @return the concatenated stage outputs, or the empty string when contents is null
     */
    public String codeCleanPipeline(String contents) {
        if (contents == null) {
            return Values.EMPTYSTRING;
        }

        // Order matters: stages are applied one after another on the same text.
        final char[][] stages = {
                { '<', '>', ')', '(', '[', ']', '|', '=', ',', ':' },
                { '.' },
                { ';', '{', '}', '/' },
                { '"', '\'' },
                { '\'', '"', '.', ';', '=', '(', ')', '[', ']', '_', ';', '@', '#' },
                { '-' }
        };

        StringBuilder indexContents = new StringBuilder();
        String current = contents;

        for (char[] stage : stages) {
            for (char symbol : stage) {
                current = current.replace(symbol, ' ');
            }
            indexContents.append(' ').append(current);
        }

        return indexContents.toString();
    }

    /**
     * Parse the query and escape it as per Lucene but without affecting search
     * operators such as AND OR and NOT. Delegates to the AND-default formatter
     * when ANDMATCH is set and to the OR-default formatter otherwise.
     *
     * @param query raw user query
     * @return the escaped and formatted query
     */
    public String formatQueryString(String query) {
        return this.ANDMATCH
                ? this.formatQueryStringAndDefault(query)
                : this.formatQueryStringOrDefault(query);
    }

    /**
     * Formats a query so that adjacent terms are joined with AND unless the user
     * supplied an explicit operator. The query is split on whitespace; the tokens
     * AND, OR and NOT are kept as operators (an operator is dropped when it would
     * start the query or directly repeat itself). Every other token is lower-cased
     * and Lucene-escaped, with the escaping of '(', ')' and '*' undone.
     *
     * @param query raw user query
     * @return the formatted, trimmed query
     */
    public String formatQueryStringAndDefault(String query) {
        final String and = " AND ";
        final String or = " OR ";
        final String not = " NOT ";

        List<String> parts = new ArrayList<>();

        for (String term : query.trim().split("\\s+")) {
            // Last emitted part, or null when nothing has been emitted yet
            String previous = Iterables.getLast(parts, null);

            if ("AND".equals(term)) {
                if (previous != null && !previous.equals(and)) {
                    parts.add(and);
                }
            } else if ("OR".equals(term)) {
                if (previous != null && !previous.equals(or)) {
                    parts.add(or);
                }
            } else if ("NOT".equals(term)) {
                if (previous != null && !previous.equals(not)) {
                    parts.add(not);
                }
            } else {
                String escaped = QueryParser.escape(term.toLowerCase()).replace("\\(", "(")
                        .replace("\\)", ")").replace("\\*", "*");

                // A term that follows another term gets an implicit AND in front of it
                boolean followsOperatorOrStart = previous == null || previous.equals(and)
                        || previous.equals(or) || previous.equals(not);

                parts.add((followsOperatorOrStart ? " " : and) + escaped + " ");
            }
        }

        return String.join(" ", parts).trim();
    }

    /**
     * Formats a query without inserting any implicit operator between terms. The
     * query is split on whitespace; the tokens AND, OR and NOT are emitted as
     * operators and every other token is lower-cased and Lucene-escaped, with the
     * escaping of '(', ')' and '*' undone, and surrounded by spaces.
     *
     * @param query raw user query
     * @return the formatted, trimmed query
     */
    public String formatQueryStringOrDefault(String query) {
        StringBuilder formatted = new StringBuilder();

        for (String term : query.trim().split("\\s+")) {
            if ("AND".equals(term)) {
                formatted.append(" AND ");
            } else if ("OR".equals(term)) {
                formatted.append(" OR ");
            } else if ("NOT".equals(term)) {
                formatted.append(" NOT ");
            } else {
                String escaped = QueryParser.escape(term.toLowerCase()).replace("\\(", "(")
                        .replace("\\)", ")").replace("\\*", "*");
                formatted.append(" ").append(escaped).append(" ");
            }
        }

        return formatted.toString().trim();
    }

    /**
     * Given a query attempts to create alternative queries that should be looser
     * and as such produce more matches or give results where none may exist for
     * the current query. Candidates are produced in this order, each one being
     * kept only when it differs from the query and is not already in the list:
     * punctuation replaced by spaces, camel-case split keywords, spelling
     * corrected words (operators removed), AND replaced by OR, AND removed and
     * NOT removed.
     *
     * @param query the query as typed by the user
     * @return the alternative queries, possibly empty
     */
    public List<String> generateAltQueries(String query) {
        List<String> alternatives = new ArrayList<>();
        String normalized = query.trim().replaceAll(" +", " ");

        // 1. Everything that is not alphanumeric or a space becomes a space
        String stripped = normalized.replaceAll("[^A-Za-z0-9 ]", " ").trim().replaceAll(" +", " ");
        if (!stripped.equals(normalized) && !Values.EMPTYSTRING.equals(stripped)) {
            alternatives.add(stripped);
        }

        // 2. Camel-case words broken into their parts
        String keywords = this.splitKeywords(normalized).trim();
        if (!keywords.equals("") && !keywords.equals(normalized) && !alternatives.contains(keywords)) {
            alternatives.add(keywords);
        }

        // 3. Spelling corrected words with the boolean operators left out
        ISpellingCorrector corrector = Singleton.getSpellingCorrector();
        StringBuilder correctedWords = new StringBuilder();
        for (String word : normalized.replaceAll(" +", " ").split(" ")) {
            String trimmedWord = word.trim();
            boolean isOperator = trimmedWord.equals("AND") || trimmedWord.equals("OR")
                    || trimmedWord.equals("NOT");
            if (!isOperator) {
                correctedWords.append(" ").append(corrector.correct(word));
            }
        }

        // 4-6. Looser operator variants, evaluated in the same order as the list below
        String[] candidates = {
                correctedWords.toString().trim(),
                normalized.replace(" AND ", " OR "),
                normalized.replace(" AND ", " "),
                normalized.replace(" NOT ", " ")
        };

        String lowerQuery = normalized.toLowerCase();
        for (String candidate : candidates) {
            if (!candidate.toLowerCase().equals(lowerQuery) && !alternatives.contains(candidate)) {
                alternatives.add(candidate);
            }
        }

        return alternatives;
    }

    /**
     * Builds a short English summary of a project: how many committers touched how
     * many files, the (up to three) most important languages, a remark about the
     * bus factor derived from the number of committers, the average share of files
     * a committer owns and the committers whose share is above 10%.
     *
     * @param projectStats statistics of the project to describe
     * @return the generated blurb
     */
    public String generateBusBlurb(ProjectStats projectStats) {
        List<CodeFacetOwner> owners = projectStats.getRepoFacetOwner();
        int ownerCount = owners.size();

        StringBuilder blurb = new StringBuilder();

        // Committers and files
        blurb.append("In this repository ").append(ownerCount);
        blurb.append(ownerCount == 1 ? " committer has contributed to " : " committers have contributed to ");
        blurb.append(projectStats.getTotalFiles());
        blurb.append(projectStats.getTotalFiles() == 1 ? " file. " : " files. ");

        // Languages
        List<CodeFacetLanguage> languages = projectStats.getCodeFacetLanguages();

        if (languages.size() == 1) {
            blurb.append("The most important language in this repository is ")
                    .append(languages.get(0).getLanguageName()).append(". ");
        } else {
            blurb.append("The most important languages in this repository are ");

            if (!languages.isEmpty()) {
                List<CodeFacetLanguage> top = languages.size() > 3 ? languages.subList(0, 3) : languages;
                int lastIndex = top.size() - 1;

                for (int i = 0; i < lastIndex; i++) {
                    blurb.append(top.get(i).getLanguageName()).append(", ");
                }
                // The ",  and" produced here is tidied up by the replace at the end
                blurb.append(" and ").append(top.get(lastIndex).getLanguageName()).append(". ");
            }
        }

        // Bus factor
        if (!owners.isEmpty()) {
            if (ownerCount < 5) {
                blurb.append("The project has a low bus factor of ").append(ownerCount);
                blurb.append(" and will be in trouble if ").append(owners.get(0).getOwner())
                        .append(" is hit by a bus. ");
            } else if (ownerCount < 15) {
                blurb.append("The project has bus factor of ").append(ownerCount).append(". ");
            } else {
                blurb.append("The project has high bus factor of ").append(ownerCount).append(". ");
            }
        }

        // Ownership share per committer
        List<String> keyPeople = new ArrayList<>();
        double shareTotal = 0;

        for (CodeFacetOwner owner : owners) {
            double share = (double) owner.getCount() / (double) projectStats.getTotalFiles();
            shareTotal += share;

            if (share > 0.1) {
                keyPeople.add(owner.getOwner());
            }
        }

        int averageFilesWorked = (int) (shareTotal / ownerCount * 100);

        blurb.append("The average person who commits this project has ownership of ");
        blurb.append(averageFilesWorked).append("% of files. ");

        if (!keyPeople.isEmpty()) {
            blurb.append("The project relies on the following people; ");
            blurb.append(String.join(", ", keyPeople)).append(". ");
        }

        return blurb.toString().replace(",  and", " and");
    }

    //    /**
    //     * Currently not used but meant to replicate the searchcode.com hash which is used to identify duplicate files
    //     * even when they have a few characters or lines missing. It should in these cases produce identical hashes.
    //     */
    //    public String hash(String contents) {
    //        int hashLength = 20;
    //
    //        if (contents.length() == 0) {
    //            return Strings.padStart("", hashLength, '0');
    //        }
    //
    //        String allowedCharacters = "BCDFGHIJKLMNOPQRSUVWXYZbcdfghijklmnopqrsuvwxyz1234567890";
    //
    //        // remove all spaces
    //        Joiner joiner = Joiner.on("").skipNulls();
    //        String toHash = joiner.join(Splitter.on(' ')
    //                            .trimResults()
    //                            .omitEmptyStrings()
    //                            .split(contents));
    //
    //        // remove all non acceptable characters
    //        for(int i=0; i< toHash.length(); i++) {
    //            char c = toHash.charAt(i);
    //
    //            if (allowedCharacters.indexOf(c) != -1) {
    //                // allowed so keep it
    //            }
    //        }
    //
    //        return "";
    //    }
}