MdDetect.java Source code

Java tutorial

Introduction

Here is the source code for MdDetect.java

Source

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.*;
import java.util.regex.Pattern;

/*
 *  Copyright (c) Microsoft. All rights reserved. Licensed under the MIT license. See full license at the bottom of this file.
 */
/**
 * Command-line tool that adds a language tag to un-annotated Markdown code fences.
 *
 * <p>File paths are read from STDIN, one per line. Each file is scanned for lines matching
 * {@link #MD_CODE_REGEX}; for every opening fence that carries none of the known language codes,
 * the body of the block is scored against per-language keyword lists and, when there is a single
 * clear winner, that language's tag is inserted after the fence. Files are rewritten in place.
 *
 * <p>An optional first program argument names a {@code Lang} constant; when given, only blocks
 * detected as that language are tagged.
 */
public class MdDetect {

    private static final boolean VERBOSE = false;

    /**
     * Delim for md regex
     */
    public static final String MD_CODE_REGEX = "^[^`]*```.*$";

    /**
     * {@link #MD_CODE_REGEX} compiled once, so it is not recompiled for every line inspected
     */
    private static final Pattern MD_CODE_PATTERN = Pattern.compile(MD_CODE_REGEX);

    /**
     * Upper bound on worker threads, as a multiple of the available processors
     */
    private static final int THREADS_PER_CORE = 4;

    /**
     * How long main waits for the workers before reporting, in seconds
     */
    private static final long AWAIT_SECONDS = 30;

    /**
     * Our thread runner
     */
    private static ExecutorService executorService;

    /**
     * construct a list of files from string paths, ignoring blank lines
     *
     * @param lines paths to files for processing
     * @return those paths, as java.io.Files
     */
    private static List<File> asFiles(final List<String> lines) {
        List<File> files = new ArrayList<>(lines.size());
        for (String path : lines) {
            // a blank line is not a usable path, so it is not turned into a task
            if (path.trim().isEmpty()) {
                continue;
            }
            files.add(new File(path));
        }
        return files;
    }

    /**
     * Convenience function to grab STDIN. Reads until end of input using the platform default
     * charset and closes the stream afterwards.
     *
     * @return strings of lines from STDIN
     * @throws RuntimeException wrapping the IOException if STDIN cannot be read
     */
    static List<String> getSysIn() {
        List<String> input = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(System.in, Charset.defaultCharset()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                input.add(line);
            }
            return input;
        } catch (IOException e) {
            // bail, keeping the original failure as the cause
            throw new RuntimeException("Unable to read file paths from STDIN", e);
        }
    }

    /**
     * Set up a fixed-size thread pool to process files in parallel.
     *
     * <p>A fixed pool is used rather than a growing one: ThreadPoolExecutor only adds threads
     * beyond its core size once its queue is full, so a small core size combined with a queue big
     * enough for every task would run everything on a single thread.
     *
     * @param capacity requested pool size (normally the number of files); the actual size is
     *                 clamped to the range 1 .. availableProcessors * THREADS_PER_CORE
     */
    private static void initExecutor(int capacity) {
        int ceiling = Runtime.getRuntime().availableProcessors() * THREADS_PER_CORE;
        int threads = Math.max(1, Math.min(capacity, ceiling));
        executorService = Executors.newFixedThreadPool(threads);
    }

    /**
     * Create instances of the process function per file. The task reads the file with the
     * platform default charset, annotates its code fences and writes the result over the same
     * file. A file that cannot be read or decoded makes the task fail before anything is written.
     *
     * @param file the file to process
     * @return a task that rewrites the file and yields nothing
     */
    private static Callable<Void> newProcessFileAction(final File file) {
        return new Callable<Void>() {
            @Override
            public Void call() throws Exception {
                Charset charset = Charset.defaultCharset();

                // read the lines of the 'in'-file
                List<String> inLines = Files.readAllLines(file.toPath(), charset);

                // write the outlines over the old file....
                Files.write(file.toPath(), annotate(inLines), charset);
                return null;
            }
        };
    }

    /**
     * Produce the annotated copy of a document.
     *
     * @param inLines the lines of the document
     * @return the same lines, with fence lines trimmed and un-annotated opening fences tagged
     */
    private static List<String> annotate(List<String> inLines) {
        List<String> outLines = new ArrayList<>(inLines.size());

        // flip/flop for blocks: every fence line toggles between 'inside' and 'outside'
        boolean inBlock = false;

        for (int ii = 0; ii < inLines.size(); ii++) {
            String line = inLines.get(ii);
            if (isFence(line)) {
                line = line.trim(); // clean it up

                inBlock = !inBlock;

                if (inBlock && !isAlreadyAnnotated(line)) {
                    // look ahead at the body of this block to guess which tag to use
                    String tag = supposeLang(grabCodeBlock(inLines, ii));

                    // an empty tag (no confident guess) leaves the fence as it was
                    line = line.replace("```", "```" + tag);
                }
            }
            outLines.add(line);
        }
        return outLines;
    }

    /**
     * @param line a line of the document
     * @return true if the whole line matches {@link #MD_CODE_REGEX}
     */
    private static boolean isFence(String line) {
        return MD_CODE_PATTERN.matcher(line).matches();
    }

    /**
     * Collect the body of the code block opened at a given line.
     *
     * @param inLines all lines of the document
     * @param ii      index of the opening fence line
     * @return the lines after the opening fence, up to (not including) the next fence line or the
     *         end of the document
     */
    private static List<String> grabCodeBlock(List<String> inLines, int ii) {
        List<String> codeBlockLines = new ArrayList<>();
        for (int jj = ii + 1; jj < inLines.size(); jj++) {
            String candidate = inLines.get(jj);
            if (isFence(candidate)) {
                break;
            }
            codeBlockLines.add(candidate);
        }
        return codeBlockLines;
    }

    /**
     * Language codes which, when present anywhere on a fence line, mark it as annotated
     */
    private static final String[] langCodes = new String[] { "C#", "c#", "java", "js", "javascript", "html", "HTML",
            "VB.net", "vb", "vba", "xml", "XML", "sql", "SQL" };

    /**
     * Checks if a codeblock is already annotated. This is a case-sensitive substring test against
     * the known language codes, so a code appearing anywhere in the line counts.
     *
     * @param line the line to inspect
     * @return true if the line is already annotated, false otherwise
     */
    static boolean isAlreadyAnnotated(String line) {
        for (String langCode : langCodes) {
            if (line.contains(langCode)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Langs Im expecting to encounter. Keyword matching is exact and case-sensitive, and code is
     * tokenized on whitespace, so an entry that itself contains a space cannot match.
     */
    private enum Lang {
        C_SHARP(0, new String[] { "c#" },
                new String[] { "abstract", "as", "base", "bool", "break", "byte", "case", "catch", "char",
                        "checked", "class", "const", "continue", "decimal", "default", "delegate", "do", "double",
                        "else", "enum", "event", "explicit", "extern", "false", "finally", "fixed", "float", "for",
                        "foreach", "goto", "if", "implicit", "in", "in (generic modifier)", "int", "interface",
                        "internal", "is", "lock", "long", "namespace", "new", "null", "object", "operator", "out",
                        "out (generic modifier)", "override", "params", "private", "protected", "public",
                        "readonly", "ref", "return", "sbyte", "sealed", "short", "sizeof", "stackalloc", "static",
                        "string", "struct", "switch", "this", "throw", "true", "try", "typeof", "uint", "ulong",
                        "unchecked", "unsafe", "ushort", "using", "virtual", "void", "volatile", "while" }),

        HTML(1, new String[] { "html" }, new String[] { "<html>", "<p>", "<span>", "<div>", "<body>", "<h1>",
                "<h2>", "<h3>", "<h4>", "<h5>", "<h6>", "<td>", "<head>", "<title>", "<br />" }),

        VB(2, new String[] { "vb", "VB.net" }, new String[] { "AddHandler", "AddressOf", "Alias", "And", "AndAlso",
                "As", "Boolean", "ByRef", "Byte", "ByVal", "Call", "Case", "Catch", "CBool", "CByte", "CChar",
                "CDate", "CDec", "CDbl", "Char", "CInt", "Class", "CLng", "CObj", "Const", "Continue", "CSByte",
                "CShort", "CSng", "CStr", "CType", "CUInt", "CULng", "CUShort", "Date", "Decimal", "Declare",
                "Default", "Delegate", "Dim", "DirectCast", "Do", "Double", "Each", "Else", "ElseIf", "End",
                "EndIf", "Enum", "Erase", "Error", "Event", "Exit", "False", "Finally", "For", "Friend", "Function",
                "Get", "GetType", "GetXMLNamespace", "Global", "GoSub", "GoTo", "Handles", "If", "If()",
                "Implements", "Imports", "In", "Inherits", "Integer", "Interface", "Is", "IsNot", "Let", "Lib",
                "Like", "Long", "Loop", "Me", "Mod", "Module", "MustInherit", "MustOverride", "MyBase", "MyClass",
                "Namespace", "Narrowing", "New", "Next", "Not", "Nothing", "NotInheritable", "NotOverridable",
                "Object", "Of", "On", "Operator", "Option", "Optional", "Or", "OrElse", "Overloads", "Overridable",
                "Overrides", "ParamArray", "Partial", "Private", "Property", "Protected", "Public", "RaiseEvent",
                "ReadOnly", "ReDim", "REM", "RemoveHandler", "Resume", "Return", "SByte", "Select", "Set",
                "Shadows", "Shared", "Short", "Single", "Static", "Step", "Stop", "String", "Structure", "Sub",
                "SyncLock", "Then", "Throw", "To", "True", "Try", "TryCast", "TypeOf", "Variant", "Wend",
                "UInteger", "ULong", "UShort", "Using", "When", "While", "Widening", "With", "WithEvents",
                "WriteOnly", "Xor", "#Const", "#Else", "#ElseIf", "#End", "#If", }),

        // no keywords are listed, so XML can never out-score another language
        XML(3, new String[] { "XML" }, new String[] {}),

        SQL(4, new String[] { "sql" }, new String[] { "ABSOLUTE", "ACTION", "ADA", "ADD", "ADMIN", "AFTER",
                "AGGREGATE", "ALIAS", "ALL", "ALLOCATE", "ALTER", "AND", "ANY", "ARE", "ARRAY", "AS", "ASC",
                "ASSERTION", "AT", "AUTHORIZATION", "AVG", "BACKUP", "BEFORE", "BEGIN", "BETWEEN", "BINARY", "BIT",
                "BIT_LENGTH", "BLOB", "BOOLEAN", "BOTH", "BREADTH", "BREAK", "BROWSE", "BULK", "BY", "CALL",
                "CASCADE", "CASCADED", "CASE", "CAST", "CATALOG", "CHAR", "CHARACTER", "CHARACTER_LENGTH",
                "CHAR_LENGTH", "CHECK", "CHECKPOINT", "CLASS", "CLOB", "CLOSE", "CLUSTERED", "COALESCE", "COLLATE",
                "COLLATION", "COLUMN", "COMMIT", "COMPLETION", "COMPUTE", "CONNECT", "CONNECTION", "CONSTRAINT",
                "CONSTRAINTS", "CONSTRUCTOR", "CONTAINS", "CONTAINSTABLE", "CONTINUE", "CONVERT", "CORRESPONDING",
                "COUNT", "CREATE", "CROSS", "CUBE", "CURRENT", "CURRENT_DATE", "CURRENT_PATH", "CURRENT_ROLE",
                "CURRENT_TIME", "CURRENT_TIMESTAMP", "CURRENT_USER", "CURSOR", "CYCLE", "DATA", "DATABASE", "DATE",
                "DAY", "DBCC", "DEALLOCATE", "DEC", "DECIMAL", "DECLARE", "DEFAULT", "DEFERRABLE", "DEFERRED",
                "DELETE", "DENY", "DEPTH", "DEREF", "DESC", "DESCRIBE", "DESCRIPTOR", "DESTROY", "DESTRUCTOR",
                "DETERMINISTIC", "DIAGNOSTICS", "DICTIONARY", "DISCONNECT", "DISK", "DISTINCT", "DISTRIBUTED",
                "DOMAIN", "DOUBLE", "DROP", "DUMMY", "DUMP", "DYNAMIC", "EACH", "ELSE", "END", "END-EXEC", "EQUALS",
                "ERRLVL", "ESCAPE", "EVERY", "EXCEPT", "EXCEPTION", "EXEC", "EXECUTE", "EXISTS", "EXIT", "EXTERNAL",
                "EXTRACT", "FALSE", "FETCH", "FILE", "FILLFACTOR", "FIRST", "FLOAT", "FOR", "FOREIGN", "FORTRAN",
                "FOUND", "FREE", "FREETEXT", "FREETEXTTABLE", "FROM", "FULL", "FUNCTION", "GENERAL", "GET",
                "GLOBAL", "GO", "GOTO", "GRANT", "GROUP", "GROUPING", "HAVING", "HOLDLOCK", "HOST", "HOUR",
                "IDENTITY", "IDENTITYCOL", "IDENTITY_INSERT", "IF", "IGNORE", "IMMEDIATE", "IN", "INCLUDE", "INDEX",
                "INDICATOR", "INITIALIZE", "INITIALLY", "INNER", "INOUT", "INPUT", "INSENSITIVE", "INSERT", "INT",
                "INTEGER", "INTERSECT", "INTERVAL", "INTO", "IS", "ISOLATION", "ITERATE", "JOIN", "KEY", "KILL",
                "LANGUAGE", "LARGE", "LAST", "LATERAL", "LEADING", "LEFT", "LESS", "LEVEL", "LIKE", "LIMIT",
                "LINENO", "LOAD", "LOCAL", "LOCALTIME", "LOCALTIMESTAMP", "LOCATOR", "LOWER", "MAP", "MATCH", "MAX",
                "MIN", "MINUTE", "MODIFIES", "MODIFY", "MODULE", "MONTH", "NAMES", "NATIONAL", "NATURAL", "NCHAR",
                "NCLOB", "NEW", "NEXT", "NO", "NOCHECK", "NONCLUSTERED", "NONE", "NOT", "NULL", "NULLIF", "NUMERIC",
                "OBJECT", "OCTET_LENGTH", "OF", "OFF", "OFFSETS", "OLD", "ON", "ONLY", "OPEN", "OPENDATASOURCE",
                "OPENQUERY", "OPENROWSET", "OPENXML", "OPERATION", "OPTION", "OR", "ORDER", "ORDINALITY", "OUT",
                "OUTER", "OUTPUT", "OVER", "OVERLAPS", "PAD", "PARAMETER", "PARAMETERS", "PARTIAL", "PASCAL",
                "PATH", "PERCENT", "PLAN", "POSITION", "POSTFIX", "PRECISION", "PREFIX", "PREORDER", "PREPARE",
                "PRESERVE", "PRIMARY", "PRINT", "PRIOR", "PRIVILEGES", "PROC", "PROCEDURE", "PUBLIC", "RAISERROR",
                "READ", "READS", "READTEXT", "REAL", "RECONFIGURE", "RECURSIVE", "REF", "REFERENCES", "REFERENCING",
                "RELATIVE", "REPLICATION", "RESTORE", "RESTRICT", "RESULT", "RETURN", "RETURNS", "REVOKE", "RIGHT",
                "ROLE", "ROLLBACK", "ROLLUP", "ROUTINE", "ROW", "ROWCOUNT", "ROWGUIDCOL", "ROWS", "RULE", "SAVE",
                "SAVEPOINT", "SCHEMA", "SCOPE", "SCROLL", "SEARCH", "SECOND", "SECTION", "SELECT", "SEQUENCE",
                "SESSION", "SESSION_USER", "SET", "SETS", "SETUSER", "SHUTDOWN", "SIZE", "SMALLINT", "SOME",
                "SPACE", "SPECIFIC", "SPECIFICTYPE", "SQL", "SQLCA", "SQLCODE", "SQLERROR", "SQLEXCEPTION",
                "SQLSTATE", "SQLWARNING", "START", "STATE", "STATEMENT", "STATIC", "STATISTICS", "STRUCTURE",
                "SUBSTRING", "SUM", "SYSTEM_USER", "TABLE", "TEMPORARY", "TERMINATE", "TEXTSIZE", "THAN", "THEN",
                "TIME", "TIMESTAMP", "TIMEZONE_HOUR", "TIMEZONE_MINUTE", "TO", "TOP", "TRAILING", "TRAN",
                "TRANSACTION", "TRANSLATE", "TRANSLATION", "TREAT", "TRIGGER", "TRIM", "TRUE", "TRUNCATE",
                "TSEQUAL", "UNDER", "UNION", "UNIQUE", "UNKNOWN", "UNNEST", "UPDATE", "UPDATETEXT", "UPPER",
                "USAGE", "USE", "USER", "USING", "VALUE", "VALUES", "VARCHAR", "VARIABLE", "VARYING", "VIEW",
                "WAITFOR", "WHEN", "WHENEVER", "WHERE", "WHILE", "WITH", "WITHOUT", "WORK", "WRITE", "WRITETEXT",
                "YEAR", "ZONE" }),

        // the empty abbreviation means "append nothing" when no language can be picked
        UNKNOWN(5, new String[] { "" }, new String[] {}),

        // NOTE(review): the commented-out keywords look deliberately excluded - confirm before removing
        JS(6, new String[] { "js", "javascript" },
                new String[] { "abstract", "arguments", "boolean", "break", "byte", "case", "catch", "char",
                        //                        "class",
                        "const", "continue", "debugger", "default", "delete", "do", "double", "else",
                        //                        "enum",
                        "eval",
                        //                        "export",
                        //                        "extends",
                        "false", "final", "finally", "float", "for", "function", "goto", "if", "implements",
                        //                        "import",
                        "in", "instanceof", "int", "interface", "let", "long", "native", "new", "null", "package",
                        "private", "protected", "public", "return", "short", "static",
                        //                        "super",
                        "switch", "synchronized", "this", "throw", "throws", "transient", "true", "try", "typeof",
                        "var", "void", "volatile", "while", "with", "yield" });

        /**
         * pos in score counter
         */
        final int index;

        /**
         * md abbrev; the first entry is the tag that gets written
         */
        final String[] abbrev;

        /**
         * keywords
         */
        final String[] tokens;

        /**
         * the same keywords as a set, for constant-time lookup while scoring
         */
        private final Set<String> tokenSet;

        Lang(int index, String[] abbrev, String[] tokens) {
            this.index = index;
            this.abbrev = abbrev;
            this.tokens = tokens;
            this.tokenSet = new HashSet<>(Arrays.asList(tokens));
        }

        /**
         * @param token a whitespace-delimited word from a code block
         * @return true if the word is exactly one of this language's keywords
         */
        boolean hasToken(String token) {
            return tokenSet.contains(token);
        }

    }

    /**
     * Naively guess the language of a code block by counting, per language, how many
     * whitespace-separated words are keywords of that language.
     *
     * <p>A tag is only produced when exactly one language has the highest non-zero score and,
     * if a preferred language was given on the command line, that winner is the preferred one.
     *
     * @param codeBlockLines the lines between the fences; may be empty
     * @return the tag to append to the fence, or the empty string when no tag should be added
     */
    private static String supposeLang(List<String> codeBlockLines) {
        final Lang[] langs = Lang.values();
        final int[] scores = new int[langs.length];

        // tokenize line by line so words on adjacent lines are never fused together
        for (String codeLine : codeBlockLines) {
            for (String token : codeLine.trim().split("\\s+")) {
                // compare to known langs
                for (Lang lang : langs) {
                    if (looksLike(token, lang)) {
                        scores[lang.index]++;
                    }
                }
            }
        }

        // pick the highest score; a later language matching the current best makes it a tie
        Lang best = Lang.UNKNOWN;
        int largest = 0;
        boolean uncertain = true;
        for (Lang lang : langs) {
            int score = scores[lang.index];
            if (score > largest) {
                largest = score;
                best = lang;
                uncertain = false;
            } else if (score == largest) {
                // we have a tie - we're uncertain
                uncertain = true;
            }
        }

        if (VERBOSE) {
            for (int ii = 0; ii < scores.length; ii++) {
                System.out.println("Array contents [" + ii + "] = " + scores[ii]);
            }
        }

        // a null preference accepts every language; otherwise the winner must be the preferred one
        boolean accepted = !uncertain && (null == prefer || best == prefer);

        String appendValue = (accepted ? best : Lang.UNKNOWN).abbrev[0];

        if (VERBOSE && best != Lang.UNKNOWN) {
            System.out.println("Appending tag: " + appendValue);
        }

        return appendValue;
    }

    /**
     * @param token a word from a code block
     * @param lang  the language to test against
     * @return true if the word is a keyword of that language
     */
    private static boolean looksLike(String token, Lang lang) {
        return lang.hasToken(token);
    }

    private static Lang prefer = null; // null preference indicates 'all'

    /**
     * Entry point: reads file paths from STDIN and annotates each file in parallel.
     *
     * @param args optionally, the name of a {@code Lang} constant as the first element, to tag
     *             only blocks detected as that language; an unknown name lists the valid ones
     *             and exits with status 1
     * @throws InterruptedException if interrupted while waiting for the workers
     */
    public static void main(String[] args) throws InterruptedException {
        if (null != args && args.length > 0 && null != args[0]) {
            System.out.println("Parsing lang: " + args[0]);
            try {
                prefer = Lang.valueOf(args[0]);
            } catch (IllegalArgumentException e) {
                System.err.println("Lang switch must be one of:");
                for (Lang lang : Lang.values()) {
                    System.err.println("\t" + lang.name());
                }
                System.exit(1);
            }
        }

        List<File> files = asFiles(getSysIn());

        System.out.println("Processing " + files.size() + " files");

        if (files.size() < 1) {
            System.exit(0); // quit if nothing to do
        }

        initExecutor(files.size());

        // keep the futures: they are the only place a failure inside a task is recorded
        List<Future<Void>> pending = new ArrayList<>(files.size());
        for (File file : files) {
            pending.add(executorService.submit(newProcessFileAction(file)));
        }

        executorService.shutdown();
        executorService.awaitTermination(AWAIT_SECONDS, TimeUnit.SECONDS);

        for (int ii = 0; ii < files.size(); ii++) {
            reportOutcome(files.get(ii), pending.get(ii));
        }
    }

    /**
     * Print a message to STDERR for a file whose task failed or has not finished yet.
     *
     * @param file    the file the task was created for
     * @param outcome the future returned when the task was submitted
     * @throws InterruptedException if interrupted while collecting the result
     */
    private static void reportOutcome(File file, Future<Void> outcome) throws InterruptedException {
        if (!outcome.isDone()) {
            System.err.println("Still processing after " + AWAIT_SECONDS + "s: " + file);
            return;
        }
        try {
            outcome.get();
        } catch (ExecutionException e) {
            System.err.println("Failed to process " + file + ": " + e.getCause());
        }
    }

}
// *********************************************************
// Copyright (c) Microsoft Corporation
// All rights reserved.
//
// MIT License:
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
// *********************************************************