net.lldp.checksims.ChecksimsCommandLine.java Source code

Java tutorial

Introduction

Here is the source code for net.lldp.checksims.ChecksimsCommandLine.java

Source

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See LICENSE.txt included in this distribution for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at LICENSE.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * Copyright (c) 2014-2015 Nicholas DeMarinis, Matthew Heon, and Dolan Murvihill
 */

package net.lldp.checksims;

import com.google.common.collect.ImmutableMap;

import net.lldp.checksims.algorithm.AlgorithmRegistry;
import net.lldp.checksims.algorithm.preprocessor.CommonCodeLineRemovalPreprocessor;
import net.lldp.checksims.algorithm.preprocessor.PreprocessorRegistry;
import net.lldp.checksims.algorithm.preprocessor.SubmissionPreprocessor;
import net.lldp.checksims.algorithm.similaritymatrix.output.MatrixPrinter;
import net.lldp.checksims.algorithm.similaritymatrix.output.MatrixPrinterRegistry;
import net.lldp.checksims.parse.SubmissionPercentableCalculator;
import net.lldp.checksims.submission.Submission;

import org.apache.commons.cli.*;
import org.apache.commons.collections4.list.SetUniqueList;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.impl.SimpleLogger;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.stream.Collectors;

import net.lingala.zip4j.core.ZipFile;
import net.lingala.zip4j.exception.ZipException;
import net.lingala.zip4j.model.FileHeader;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;

/**
 * Parses Checksims' command-line options.
 *
 * TODO: Consider changing from a static class? Having the CommandLine as an instance variable would greatly
 * simplify the parsing methods below.
 */
public final class ChecksimsCommandLine {
    // Shared logger. Starts out null; runCLI() assigns it via startLogger(), and parseBaseFlags() obtains one
    // from LoggerFactory if it is still null at that point.
    private static Logger logs;
    // Temporary extraction directories registered through addTempFile() and removed by deleteTempFiles().
    private static Set<File> tempFiles = new HashSet<>();

    // Private constructor: the class only exposes static methods and is not meant to be instantiated.
    private ChecksimsCommandLine() {
    }

    /**
     * Configure the SLF4J SimpleLogger via system properties, then obtain the logger for this class.
     *
     * @param level Verbosity to use: 0 selects INFO, 1 selects DEBUG, 2 selects TRACE
     * @return Logger for ChecksimsCommandLine, requested after the properties have been set
     * @throws RuntimeException if level is anything other than 0, 1 or 2
     */
    static Logger startLogger(int level) {
        final String threshold;
        switch (level) {
            case 0:
                threshold = "INFO";
                break;
            case 1:
                threshold = "DEBUG";
                break;
            case 2:
                threshold = "TRACE";
                break;
            default:
                // Reject unknown levels before touching any system property
                throw new RuntimeException("Unrecognized verbosity level passed to startLogger!");
        }
        System.setProperty(SimpleLogger.DEFAULT_LOG_LEVEL_KEY, threshold);

        // Output format: no logger name, no thread name, level shown in brackets
        System.setProperty(SimpleLogger.SHOW_LOG_NAME_KEY, "false");
        System.setProperty(SimpleLogger.SHOW_THREAD_NAME_KEY, "false");
        System.setProperty(SimpleLogger.LEVEL_IN_BRACKETS_KEY, "true");

        return LoggerFactory.getLogger(ChecksimsCommandLine.class);
    }

    /**
     * Build the full set of command-line options understood by Checksims.
     *
     * @param anyRequired When true, the submission directory option (-s) is marked as required; when false,
     *                    no option is required
     * @return Freshly constructed Options containing every supported flag
     */
    static Options getOpts(boolean anyRequired) {
        Options opts = new Options();

        // Options that select how submissions are compared and reported
        opts.addOption(Option.builder("a").longOpt("algorithm").desc("algorithm to compare with")
                .hasArg().argName("name").build());
        opts.addOption(Option.builder("t").longOpt("token").desc("tokenization to use for submissions")
                .hasArg().argName("type").build());
        opts.addOption(Option.builder("o").longOpt("output")
                .desc("output format(s) to use, comma-separated if multiple given")
                .hasArgs().argName("name1[,name2,...]").valueSeparator(',').build());
        opts.addOption(Option.builder("f").longOpt("file").desc("print output to given file")
                .hasArg().argName("filename").build());
        opts.addOption(Option.builder("p").longOpt("preprocess")
                .desc("preprocessor(s) to apply, comma-separated if multiple given")
                .hasArgs().argName("name1[,name2,...]").valueSeparator(',').build());
        opts.addOption(Option.builder("j").longOpt("jobs").desc("number of threads to use")
                .hasArg().argName("num").build());
        opts.addOption(Option.builder("g").longOpt("glob")
                .desc("match pattern to determine files included in submissions")
                .hasArg().argName("matchpattern").build());

        // -v and -vv live in one group so that they conflict with each other
        OptionGroup verbosity = new OptionGroup()
                .addOption(new Option("v", "verbose", false, "specify verbose output. conflicts with -vv"))
                .addOption(new Option("vv", "veryverbose", false,
                        "specify very verbose output. conflicts with -v"));
        opts.addOptionGroup(verbosity);

        opts.addOption(new Option("h", "help", false, "show usage information"));
        opts.addOption(new Option("e", "empty", false, "retain empty submissions"));
        opts.addOption(Option.builder("c").longOpt("common")
                .desc("directory containing common code which will be removed from all submissions")
                .hasArg().argName("path").build());
        opts.addOption(new Option("r", "recursive", false,
                "recursively traverse subdirectories to generate submissions"));
        opts.addOption(new Option("version", false, "print version of Checksims"));

        // Directory options take any number of values, separated by '*'
        opts.addOption(Option.builder("archive").longOpt("archivedir")
                .desc("archive submissions - compared to main submissions but not each other")
                .argName("path").hasArgs().valueSeparator('*').build());
        opts.addOption(Option.builder("s").longOpt("submissiondir")
                .desc("directory or directories containing submissions to compare - mandatory!")
                .argName("path").hasArgs().valueSeparator('*').required(anyRequired).build());

        opts.addOption(Option.builder().longOpt("ignoreInvalid")
                .desc("Do not show the result of submissions that do not parse correctly").build());

        return opts;
    }

    /**
     * Run the Commons CLI parser over the raw arguments using the options from {@link #getOpts(boolean)}.
     *
     * @param args Raw command-line arguments, must not be null
     * @param anyRequired Passed through to getOpts; controls whether required options are enforced
     * @return Parsed CommandLine
     * @throws ParseException Thrown when the arguments do not match the option definitions
     */
    static CommandLine parseOpts(String[] args, boolean anyRequired) throws ParseException {
        checkNotNull(args);

        Options accepted = getOpts(anyRequired);
        return new DefaultParser().parse(accepted, args);
    }

    /**
     * Print the usage message, the names of the supported algorithms, output strategies and preprocessors,
     * and the Checksims version to standard error.
     *
     * Note that this method does not return: it terminates the JVM with exit status 0 once the help text has
     * been written.
     */
    static void printHelp() {
        HelpFormatter f = new HelpFormatter();
        // Intentionally never closed: closing this writer would close System.err itself
        PrintWriter systemErr = new PrintWriter(System.err, true);

        f.printHelp(systemErr, 80, "checksims [args]", "checksims: check similarity of student submissions",
                getOpts(true), 2, 4, "");
        // Make sure the usage text is out before writing to System.err directly
        systemErr.flush();

        // Names are joined with ", " so that no separator dangles after the last entry
        System.err.println("\nSupported Similarity Detection Algorithms:");
        System.err.print(AlgorithmRegistry.getInstance().getSupportedImplementationNames().stream()
                .collect(Collectors.joining(", ")));
        System.err.println(
                "\nDefault algorithm is " + AlgorithmRegistry.getInstance().getDefaultImplementationName());

        System.err.println("\nSupported Output Strategies:");
        System.err.print(MatrixPrinterRegistry.getInstance().getSupportedImplementationNames().stream()
                .collect(Collectors.joining(", ")));
        System.err.println(
                "\nDefault strategy is " + MatrixPrinterRegistry.getInstance().getDefaultImplementationName());

        System.err.println("\nAvailable Preprocessors:");
        System.err.print(PreprocessorRegistry.getInstance().getSupportedImplementationNames().stream()
                .collect(Collectors.joining(", ")));
        System.err.println();

        try {
            System.err.println("\nChecksims Version " + ChecksimsRunner.getChecksimsVersion() + "\n\n");
        } catch (ChecksimsException e) {
            System.err.println("Error obtaining version: " + e.getMessage());
        }

        System.exit(0);
    }

    /**
     * Parse the CLI flags that do not require reading submissions, and produce a ChecksimsConfig from them.
     *
     * Handles the algorithm (-a), tokenization (-t), thread count (-j), ignoreInvalid, preprocessor (-p) and
     * output strategy (-o) options. Options that are absent leave the corresponding setting of a freshly
     * constructed ChecksimsConfig untouched.
     *
     * @param cli Parsed command line, must not be null
     * @return Config derived from parsed CLI
     * @throws ChecksimsException Thrown on invalid user input (including a thread count that is not a positive
     *                            integer) or internal error
     */
    static ChecksimsConfig parseBaseFlags(CommandLine cli) throws ChecksimsException {
        checkNotNull(cli);

        // If we don't have a logger, set one up
        if (logs == null) {
            logs = LoggerFactory.getLogger(ChecksimsCommandLine.class);
        }

        // Create a base config to work from
        ChecksimsConfig config = new ChecksimsConfig();

        // Parse plagiarism detection algorithm; the tokenization initially follows the chosen algorithm
        if (cli.hasOption("a")) {
            config = config.setAlgorithm(
                    AlgorithmRegistry.getInstance().getImplementationInstance(cli.getOptionValue("a")));
            config = config.setTokenization(config.getAlgorithm().getPercentableCalculator());
        }

        // An explicit tokenization is applied after -a, so it takes precedence over the algorithm's own
        if (cli.hasOption("t")) {
            config = config.setTokenization(SubmissionPercentableCalculator.fromString(cli.getOptionValue("t")));
        }

        // Parse number of threads to use
        if (cli.hasOption("j")) {
            String requestedThreads = cli.getOptionValue("j");
            int numThreads;
            try {
                numThreads = Integer.parseInt(requestedThreads);
            } catch (NumberFormatException e) {
                // Report malformed input as a user error rather than letting an unchecked exception escape
                throw new ChecksimsException("Thread count must be a positive integer, got: "
                        + requestedThreads, e);
            }

            if (numThreads < 1) {
                throw new ChecksimsException("Thread count must be positive!");
            }

            config = config.setNumThreads(numThreads);
        }

        if (cli.hasOption("ignoreInvalid")) {
            config = config.ignoreInvalid();
        }

        // Parse preprocessors; SetUniqueList keeps the given order while dropping duplicates
        if (cli.hasOption("p")) {
            List<SubmissionPreprocessor> preprocessors = SetUniqueList.setUniqueList(new ArrayList<>());

            for (String name : cli.getOptionValues("p")) {
                preprocessors.add(PreprocessorRegistry.getInstance().getImplementationInstance(name));
            }
            config = config.setPreprocessors(preprocessors);
        }

        // Parse output strategies, deduplicating the requested names first
        if (cli.hasOption("o")) {
            Set<String> deduplicatedStrategies = new HashSet<>(Arrays.asList(cli.getOptionValues("o")));

            if (deduplicatedStrategies.isEmpty()) {
                throw new ChecksimsException("Error: did not obtain a valid output strategy!");
            }

            // Convert to MatrixPrinters
            Set<MatrixPrinter> printers = new HashSet<>();
            for (String name : deduplicatedStrategies) {
                printers.add(MatrixPrinterRegistry.getInstance().getImplementationInstance(name));
            }

            config = config.setOutputPrinters(printers);
        }

        return config;
    }

    /**
     * Register a file or directory so that a later call to deleteTempFiles() removes it.
     *
     * @param f File or directory to remember for deletion
     */
    private static void addTempFile(File f) {
        tempFiles.add(f);
    }

    /**
     * Delete every file and directory previously registered as temporary.
     *
     * The registry is swapped for an empty set before deletion starts, so the registered entries are
     * processed at most once by this call.
     */
    public static void deleteTempFiles() {
        Set<File> pending = tempFiles;
        tempFiles = new HashSet<>();
        pending.forEach(ChecksimsCommandLine::deleteTempRecursive);
    }

    /**
     * Delete a file, or a directory together with everything below it, on a best-effort basis.
     *
     * Failures to delete individual entries are ignored; the result of File.delete() is not checked.
     *
     * @param f File or directory to delete
     */
    private static void deleteTempRecursive(File f) {
        if (f.isDirectory()) {
            // listFiles() returns null when the directory cannot be read (I/O error, or it vanished in the
            // meantime); in that case skip the children instead of failing with a NullPointerException.
            File[] children = f.listFiles();
            if (children != null) {
                for (File child : children) {
                    deleteTempRecursive(child);
                }
            }
        }
        f.delete();
    }

    /**
     * Extract a turnin zip file into a new, uniquely named directory below java.io.tmpdir.
     *
     * Only entries whose names start with "students" or "groups" are extracted. Afterwards every ".zip" file
     * found directly inside a directory under students/ is extracted in place and then deleted.
     * The extraction directory is registered for cleanup as soon as it has been created, so it is removed by
     * deleteTempFiles() even when extraction fails part-way.
     *
     * NOTE(review): nested zips are only unpacked below students/, not below groups/ - confirm this is
     * intended.
     *
     * @param turninZip Zip file in turnin layout
     * @return Directory the archive was extracted into; contains students/ and groups/ subdirectories
     * @throws ZipException Thrown by zip4j on error reading or extracting an archive
     * @throws ChecksimsException Thrown if the directory cannot be created or listed, the archive is encrypted,
     *                            an entry would be written outside the extraction directory, or a plain file
     *                            sits directly inside students/
     */
    private static File recursiveTurninExtraction(File turninZip) throws ZipException, ChecksimsException {
        File unzipLocation = new File(System.getProperty("java.io.tmpdir"), UUID.randomUUID().toString());
        if (!unzipLocation.mkdir()) {
            throw new ChecksimsException("cannot create " + unzipLocation.getPath());
        }
        // Register right away: if anything below throws, the directory must still get cleaned up
        addTempFile(unzipLocation);

        ZipFile zip = new ZipFile(turninZip);
        if (zip.isEncrypted()) {
            throw new ChecksimsException("zipfile: " + turninZip.getPath() + " is encrypted");
        }
        @SuppressWarnings("unchecked")
        List<FileHeader> fileHeaders = zip.getFileHeaders();

        File studentsDir = new File(unzipLocation, "students");
        File groupsDir = new File(unzipLocation, "groups");
        studentsDir.mkdirs();
        groupsDir.mkdirs();

        java.nio.file.Path extractionRoot = unzipLocation.toPath().toAbsolutePath().normalize();
        for (FileHeader header : fileHeaders) {
            String entryName = header.getFileName();
            if (!entryName.startsWith("students") && !entryName.startsWith("groups")) {
                continue;
            }
            // The archive is untrusted input: refuse names such as "students/../../x" that would escape
            // the extraction directory.
            if (!isInsideDirectory(extractionRoot, entryName)) {
                throw new ChecksimsException("zipfile: " + turninZip.getPath()
                        + " contains an entry outside the extraction directory: " + entryName);
            }
            zip.extractFile(header, unzipLocation.getPath());
        }

        File[] studentEntries = studentsDir.listFiles();
        if (studentEntries == null) {
            throw new ChecksimsException("cannot list " + studentsDir.getPath());
        }

        for (File submissionDir : studentEntries) {
            if (submissionDir.isFile()) {
                throw new ChecksimsException(
                        "invalid file in turnin directory. might this be an invalid, hand crafted zip file?");
            }

            File[] submittedFiles = submissionDir.listFiles();
            if (submittedFiles == null) {
                throw new ChecksimsException("cannot list " + submissionDir.getPath());
            }

            for (File submission : submittedFiles) {
                String submissionPath = submission.getAbsolutePath();
                if (submissionPath.endsWith(".zip")) {
                    // NOTE(review): entries of this nested archive are not checked for path traversal;
                    // extractAll relies on zip4j's own handling - verify against the zip4j version in use.
                    new ZipFile(submission).extractAll(submissionDir.getAbsolutePath());
                    submission.delete(); // remove zip file!
                }
                if (submissionPath.endsWith(".tar")) {
                    throw new RuntimeException(".tar submissions not accepted yet!");
                }
                // TODO: more archive handling,
                // tar.bz2, tar.gz, .tar, .7z, for starters
            }
        }
        return unzipLocation;
    }

    /**
     * Check that an archive entry name, resolved against root, still lies inside root.
     *
     * @param root Absolute, normalized directory entries are extracted into
     * @param entryName Entry name as stored in the archive
     * @return true if the resolved entry stays below root; false if it escapes or is not a valid path
     */
    private static boolean isInsideDirectory(java.nio.file.Path root, String entryName) {
        try {
            return root.resolve(entryName).normalize().startsWith(root);
        } catch (java.nio.file.InvalidPathException e) {
            return false;
        }
    }

    /**
     * Replace every turnin zip file in the input by the directories it extracts to.
     *
     * Directories are passed through unchanged. A file whose absolute path ends in ".zip" is extracted, and
     * the groups/ and students/ directories below the extraction root are added in its place.
     *
     * @param files Set of files to parse.
     * @return New set containing the input directories plus groups/ and students/ of every extracted zip
     * @throws ChecksimsException Thrown if an input is neither a directory nor a ".zip" file, or if a zip
     *                            file cannot be extracted
     */
    static Set<File> extractTurninFiles(Set<File> files) throws ChecksimsException {
        Set<File> result = new HashSet<>();

        for (File candidate : files) {
            if (candidate.isDirectory()) {
                result.add(candidate);
                continue;
            }

            if (!candidate.getAbsolutePath().endsWith(".zip")) {
                throw new ChecksimsException("Input is not a directory or turnin zip file!");
            }

            try {
                String rootPath = recursiveTurninExtraction(candidate).getAbsolutePath();
                result.add(new File(rootPath + File.separatorChar + "groups"));
                result.add(new File(rootPath + File.separatorChar + "students"));
            } catch (ZipException e) {
                throw new ChecksimsException("Input is not a directory or turnin zip file!", e);
            }
        }

        return result;
    }

    /**
     * Parse flags which require submissions to be built.
     *
     * Reads the glob (-g), recursive (-r), retain-empty (-e), submission directory (-s), common code (-c) and
     * archive directory (-archive) options, builds the submissions and returns a copy of the base config with
     * them applied. A failure to set up common code removal is reported as a warning and otherwise ignored.
     *
     * TODO unit tests
     *
     * @param cli Parse CLI options
     * @param baseConfig Base configuration to work off
     * @return Modified baseConfig with submissions (and possibly common code and archive submissions) changed
     * @throws ChecksimsException Thrown on bad argument
     * @throws IOException Thrown on error building submissions
     */
    static ChecksimsConfig parseFileFlags(CommandLine cli, ChecksimsConfig baseConfig)
            throws ChecksimsException, IOException {
        checkNotNull(cli);
        checkNotNull(baseConfig);

        // This method logs unconditionally, so make sure a logger exists even when it is called directly
        if (logs == null) {
            logs = LoggerFactory.getLogger(ChecksimsCommandLine.class);
        }

        ChecksimsConfig toReturn = new ChecksimsConfig(baseConfig);

        // Get glob match pattern
        // Default to *
        String globPattern = cli.getOptionValue("g", "*");

        // Check if we are recursively building
        boolean recursive = cli.hasOption("r");

        // Check if we are retaining empty submissions
        boolean retainEmpty = cli.hasOption("e");

        // Get submission directories
        if (!cli.hasOption("s")) {
            throw new ChecksimsException("Must provide at least one submission directory!");
        }

        Set<File> submissionDirs = toAbsoluteFiles(cli.getOptionValues("s"));

        if (submissionDirs.isEmpty()) {
            throw new ChecksimsException("Must provide at least one submission directory!");
        }

        // Generate submissions
        Set<Submission> submissions = getSubmissions(submissionDirs, globPattern, recursive, retainEmpty);

        logs.debug("Generated " + submissions.size() + " submissions to process.");

        if (submissions.isEmpty()) {
            throw new ChecksimsException("Could not build any submissions to operate on!");
        }

        toReturn = toReturn.setSubmissions(submissions);

        // Check if we need to perform common code removal
        if (cli.hasOption("c")) {
            // Get the directory containing the common code
            String commonCodeDirString = cli.getOptionValue("c");

            List<SubmissionPreprocessor> procs = new ArrayList<>(toReturn.getPreprocessors());
            try {
                procs.add(getCommonCodeRemoval(commonCodeDirString, submissionDirs, globPattern));
            } catch (IOException | ChecksimsException e) {
                // Best effort: carry on without common code removal, but tell the user that the option
                // they asked for is not in effect instead of hiding it at debug level.
                logs.warn("Skipping common code removal: " + e.getMessage());
                logs.debug("Common code removal failure", e);
            }
            toReturn = toReturn.setPreprocessors(procs);
        }

        // Check if we need to add archive directories
        if (cli.hasOption("archive")) {
            Set<File> archiveDirs = toAbsoluteFiles(cli.getOptionValues("archive"));

            // Ensure that none of them are also submission directories. This has to happen before zip
            // extraction: afterwards a zip is represented by its temporary extraction directories and would
            // no longer compare equal to the path given with -s.
            for (File archiveDir : archiveDirs) {
                if (submissionDirs.contains(archiveDir)) {
                    throw new ChecksimsException("Directory is both an archive directory and submission directory: "
                            + archiveDir.getAbsolutePath());
                }
            }

            archiveDirs = extractTurninFiles(archiveDirs);

            // Get set of archive submissions
            Set<Submission> archiveSubmissions = getSubmissions(archiveDirs, globPattern, recursive, retainEmpty);

            logs.debug("Generated " + archiveSubmissions.size() + " archive submissions to process");

            if (archiveSubmissions.isEmpty()) {
                logs.warn("Did not find any archive submissions to test with!");
            }

            toReturn = toReturn.setArchiveSubmissions(archiveSubmissions);
        }

        return toReturn;
    }

    /**
     * Convert path strings into a set of absolute files, so that identical locations collapse into one entry.
     *
     * @param paths Paths as given on the command line
     * @return Set of absolute File objects
     */
    private static Set<File> toAbsoluteFiles(String[] paths) {
        return Arrays.stream(paths).map(File::new).map(File::getAbsoluteFile).collect(Collectors.toSet());
    }

    /**
     * Build the preprocessor that strips common code from submissions.
     *
     * @param commonCodeDirString Path of the directory holding the common code
     * @param submissionAndArchiveDirs Directories already in use for submissions; the common code directory
     *                                 must not be one of them
     * @param glob Glob pattern passed to Submission.submissionFromDir when reading the common code
     * @return Preprocessor wrapping the submission built from the common code directory
     * @throws ChecksimsException Thrown if the directory is also a submission directory, or if the common
     *                            code read from it is empty
     * @throws IOException Declared for the call that reads the common code directory
     */
    public static SubmissionPreprocessor getCommonCodeRemoval(String commonCodeDirString,
            Set<File> submissionAndArchiveDirs, String glob) throws ChecksimsException, IOException {
        // Compare on the absolute form, matching how the submission directories are stored
        File commonRoot = new File(commonCodeDirString).getAbsoluteFile();
        if (submissionAndArchiveDirs.contains(commonRoot)) {
            throw new ChecksimsException("Common code directory cannot be a submission directory!");
        }

        Submission commonCode = Submission.submissionFromDir(commonRoot, glob, true);
        if (commonCode.getContentAsString().isEmpty()) {
            throw new ChecksimsException("Common Code directory is empty");
        }

        return new CommonCodeLineRemovalPreprocessor(commonCode);
    }

    /**
     * Build the collection of submissions Checksims will be run on.
     *
     * Turnin zip files among the inputs are extracted first; submissions are then built from each resulting
     * directory.
     *
     * TODO add unit tests
     *
     * @param submissionDirs Directories (or turnin zip files) to build submissions from; must be non-empty
     * @param glob Glob matcher to use when building submissions
     * @param recursive Whether to recursively traverse when building submissions
     * @param retainEmpty Whether submissions with empty content are kept; when false they are dropped
     * @return Collection of submissions which will be used to run Checksims
     * @throws IOException Thrown on issue reading files or traversing directories to build submissions
     * @throws ChecksimsException Thrown if an input is neither a directory nor an extractable zip file
     */
    public static Set<Submission> getSubmissions(Set<File> submissionDirs, String glob, boolean recursive,
            boolean retainEmpty) throws IOException, ChecksimsException {
        checkNotNull(submissionDirs);
        checkArgument(!submissionDirs.isEmpty(), "Must provide at least one submission directory!");
        checkNotNull(glob);

        Set<File> sources = extractTurninFiles(submissionDirs);

        // Collect the submissions of every source directory
        Set<Submission> built = new HashSet<>();
        for (File source : sources) {
            if (logs != null) {
                logs.debug("Adding directory " + source.getName());
            }

            built.addAll(Submission.submissionListFromDir(source, glob, recursive));
        }

        if (retainEmpty) {
            return built;
        }

        // Empty submissions are not wanted: copy over only those with content
        Set<Submission> nonEmpty = new HashSet<>();
        for (Submission candidate : built) {
            if (!candidate.getContentAsString().isEmpty()) {
                nonEmpty.add(candidate);
                continue;
            }

            if (logs != null) {
                logs.warn("Discarding empty submission " + candidate.getName());
            }
        }

        return nonEmpty;
    }

    /**
     * Entry point for command-line use: parse the arguments, run Checksims and emit the results.
     *
     * Results go to one file per output strategy when -f is given (named after the given base name with the
     * strategy name as extension), otherwise to standard output. With -h or -version the method prints the
     * requested information and terminates the JVM.
     *
     * TODO add unit tests
     *
     * @param args CLI arguments to parse
     * @throws ParseException Thrown on error parsing CLI arguments
     * @throws ChecksimsException Thrown on invalid CLI arguments or error running Checksims
     * @throws IOException Thrown on error building a submission from files or writing output to file
     */
    public static void runCLI(String[] args) throws ParseException, ChecksimsException, IOException {
        checkNotNull(args);

        // First pass with nothing required, so that --help and --version work without other arguments
        CommandLine cli = parseOpts(args, false);

        if (cli.hasOption("h")) {
            printHelp();
        }

        if (cli.hasOption("version")) {
            System.err.println("Checksims version " + ChecksimsRunner.getChecksimsVersion());
            System.exit(0);
        }

        // Second pass enforces the required arguments
        cli = parseOpts(args, true);

        // -vv wins over -v; neither means the default verbosity
        int verbosity = cli.hasOption("vv") ? 2 : (cli.hasOption("v") ? 1 : 0);
        logs = startLogger(verbosity);

        // Basic flags first, then the flags that need submissions to be read
        ChecksimsConfig finalConfig = parseFileFlags(cli, parseBaseFlags(cli));

        // Run Checksims with this config
        ImmutableMap<String, String> output = ChecksimsRunner.runChecksims(finalConfig);

        if (!cli.hasOption("f")) {
            // No output file requested: print every strategy's result to STDOUT
            for (Map.Entry<String, String> result : output.entrySet()) {
                System.out.println("\n\n");
                System.out.println("Output from " + result.getKey() + "\n");
                System.out.println(result.getValue());
            }
        } else {
            String outfileBaseName = cli.getOptionValue("f");

            for (Map.Entry<String, String> result : output.entrySet()) {
                String strategy = result.getKey();
                // Final filename is the basename specified through CLI, with the strategy name as its extension.
                File outfile = new File(outfileBaseName + "." + strategy);

                logs.info("Writing " + strategy + " output to " + outfile.getName());

                FileUtils.writeStringToFile(outfile, result.getValue(), StandardCharsets.UTF_8);
            }
        }

        logs.trace("CLI parsing complete!");
    }
}