es.ucm.fdi.ac.Analysis.java Source code

Introduction

Here is the source code for es.ucm.fdi.ac.Analysis.java
Source

/**
 * AC - A source-code copy detector
 *
 *     For more information please visit:  http://github.com/manuel-freire/ac
 *
 * ****************************************************************************
 *
 * This file is part of AC, version 2.0
 *
 * AC is free software: you can redistribute it and/or modify it under the
 * terms of the GNU Lesser General Public License as published by the
 * Free Software Foundation, either version 3 of the License,
 * or (at your option) any later version.
 *
 * AC is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with AC.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * Analysis.java 
 *
 * 18-Apr-2006: first version (mfreire)
 */

package es.ucm.fdi.ac;

import es.ucm.fdi.ac.extract.FileTreeNode;
import es.ucm.fdi.ac.test.Test;
import es.ucm.fdi.util.XMLSerializable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;

import java.util.HashMap;
import java.util.HashSet;
import org.apache.log4j.Logger;
import org.apache.log4j.NDC;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.input.SAXBuilder;
import org.jdom2.output.Format;
import org.jdom2.output.XMLOutputter;

/**
 * Instantiates 'test @see{Submission}s' from a source directory 
 * and applies similarity @see{Test}s to them. 
 * Also supports loading results for later analysis, and automatic
 * language recognition (via file extensions...).
 *
 * @author mfreire
 */
public class Analysis implements XMLSerializable {

    private static final Logger log = Logger.getLogger(Analysis.class);

    private static final String VERSION_STRING = "2.0";

    /** Set of sources that are being analyzed. */
    private SourceSet sourceSet;

    /** The submissions being compared. */
    private Submission[] subs;

    /**
     * Submissions by ID.
     */
    private final HashMap<String, Submission> idsToSubs = new HashMap<String, Submission>();

    /** 
     * Currently applied tests. 
     * Results are available in each individual submission
     */
    private final HashSet<Test> appliedTests;

    public interface TokenizerFactory {
        Tokenizer getTokenizerFor(Submission[] subs);
    }

    private static TokenizerFactory tokenizerFactory;

    public static void setTokenizerFactory(TokenizerFactory tokenizerFactory) {
        Analysis.tokenizerFactory = tokenizerFactory;
    }

    /**
     * Minimal initialization
     */
    public Analysis() {
        sourceSet = null;
        appliedTests = new HashSet<>();
        subs = new Submission[0];
    }

    /**
     * Initialize subjects from a FileTreeNode.
     *
     * Allows complex filters and 'virtual' files (using the SourceFileCache).
     * After filtering, the resulting filteredTree is expected to contain, on
     * the first level, one folder per submission (folder name to be used as ID);
     * and under each submission-folder, the files that will be analyzed.
     * 
     * @param sources to load
     * @throws java.io.IOException on error
     */
    public void loadSources(SourceSet sources) throws IOException {
        this.sourceSet = sources;
        FileTreeNode root = sources.getFilteredTree();

        if (root == null) {
            throw new IllegalArgumentException("nothing to analyze");
        }

        HashMap<String, Submission> unique = new HashMap<>();
        idsToSubs.clear();
        int i = 0;
        for (FileTreeNode dn : root.getChildren()) {
            Submission s = new Submission(dn.getLabel(), dn.getPath(), 0);
            log.info("   created sub " + s.getId());
            for (FileTreeNode fn : dn.getLeafChildren()) {
                log.debug("    - " + fn.getFile().getName());
                s.addSource(fn.getFile());
            }

            if (!unique.containsKey(s.getHash())) {
                unique.put(s.getHash(), s);
                s.setInternalId(i++);
            } else {
                Submission p = unique.get(s.getHash());
                log.warn("Detected EXACT duplicate: " + s.getHash() + "\n" + " - " + p.getId() + " ("
                        + p.getOriginalPath() + ")\n" + " - " + s.getId() + " (" + s.getOriginalPath() + ")\n");
            }
        }

        subs = new Submission[unique.size()];
        i = 0;
        for (Submission s : unique.values()) {
            subs[i++] = s;
            idsToSubs.put(s.getId(), s);
        }
    }

    /**
     * Choose the right tokenizer for a given file
     */
    public Tokenizer chooseTokenizer() {
        return tokenizerFactory.getTokenizerFor(subs);
    }

    /**
     * @return true if there are available results for this testKey
     */
    public boolean hasResultsForKey(String testKey) {
        if (subs.length == 0) {
            return false;
        }
        Submission first = subs[0];
        return (first.getData(testKey) != null);
    }

    /**
     * Preprocess files
     */
    public void prepareTest(Test t) {
        t.setProgress(0f);
        t.setCancelled(false);

        t.init(subs);

        for (int i = 0; i < subs.length; i++) {
            if (t.isCancelled()) {
                return;
            }
            try {
                NDC.push("Pre-" + subs[i].getId());
                t.preprocess(subs[i]);
                NDC.pop();
            } catch (Throwable re) {
                throw new RuntimeException("Error during pre-processing " + subs[i].getId(), re);
            }
            t.setProgress(i / (float) subs.length);
        }
        t.setProgress(1f);
    }

    private void applyParallelizedTest(Test t) {
        int nProc = Runtime.getRuntime().availableProcessors();
        int slices[] = calculateSliceSizes(nProc, subs.length);

        t.setProgress(0f);
        t.setCancelled(false);
        float[][] F = new float[subs.length][subs.length];

        // launch all jobs, first one with "monitor" set to "true"
        Thread threads[] = new Thread[nProc];
        SimilarityJob lastJob = null;
        for (int i = 0; i < nProc; i++) {
            lastJob = new SimilarityJob(F, t, slices[i], slices[i + 1], i == 0);
            threads[i] = new Thread(lastJob);
            threads[i].start();
        }

        // join them afterwards
        for (int i = 0; i < nProc; i++) {
            try {
                threads[i].join();
            } catch (InterruptedException ie) {
                log.warn("I hate being interrupted. Test aborted");
                t.setCancelled(true);
            }
        }

        lastJob.end();
    }

    /**
     * Apply a single test
     */
    public void applyTest(Test t) {
        if (t.isIndependentSimilarity()) {
            applyParallelizedTest(t);
            return;
        }

        t.setProgress(0f);
        t.setCancelled(false);

        float[][] F = new float[subs.length][subs.length];
        SimilarityJob job = new SimilarityJob(F, t, 0, subs.length, true);
        job.run();
        job.end();
    }

    /**
     * Return the limits of each slice in which to divide a large matrix-based job.
     * Slice limits correspond to indices, where
     * L[i+1]*L[i+1] - L[i]*L[i] = mSize*mSize/nJobs;
     * @param mSize rows in the square matrix that is to be split up
     * @return an array of nJobs+1 limits
     */
    private static int[] calculateSliceSizes(int nJobs, int mSize) {
        int x = mSize * mSize / nJobs;
        int slices[] = new int[nJobs + 1];
        slices[0] = 0;
        slices[nJobs] = mSize;
        for (int i = 0; i < nJobs - 1; i++) {
            slices[i + 1] = (int) Math.sqrt(x + slices[i] * slices[i]);
        }
        return slices;
    }

    private class SimilarityJob implements Runnable {

        private final float[][] F;
        private final int startIdx, endIdx;
        private final Test t;
        private final boolean monitor;

        public SimilarityJob(float[][] F, Test t, int startIdx, int endIdx, boolean monitor) {
            this.F = F;
            this.t = t;
            this.startIdx = startIdx;
            this.endIdx = endIdx;
            this.monitor = monitor;
        }

        public void run() {
            log.debug("I am a job from " + startIdx + " to " + endIdx);
            int total = (endIdx * endIdx - startIdx * startIdx) / 2;
            for (int i = startIdx, k = 0; i < endIdx; i++) {
                for (int j = 0; j < i; j++, k++) {
                    if (t.isCancelled()) {
                        return;
                    }
                    try {
                        F[i][j] = F[j][i] = t.similarity(subs[i], subs[j]);
                    } catch (Throwable re) {
                        t.setCancelled(true);
                        throw new RuntimeException(
                                "Error comparando " + subs[i].getId() + " con " + subs[j].getId(), re);
                    }
                    if (monitor) {
                        t.setProgress(k / (float) total);
                    }
                }
            }
        }

        /**
         * ends the job for all threads; should only be called on one of them
         */
        public void end() {
            if (!t.isCancelled()) {
                for (int i = 0; i < subs.length; i++) {
                    subs[i].putData(t.getTestKey(), F[i]);
                }
                t.setProgress(1f);
                appliedTests.add(t);
            }
        }
    }

    /**
     * Retrieve an already-run test by key
     * @param key for the test
     * @return the test that was run, or null if not found
     */
    public Test getTestByKey(String key) {
        for (Test t : appliedTests) {
            if (t.getTestKey().equals(key)) {
                return t;
            }
        }
        return null;
    }

    public Submission[] getSubmissions() {
        return subs;
    }

    public HashSet<Test> getAppliedTests() {
        return appliedTests;
    }

    public Submission getSubmission(String id) {
        return idsToSubs.get(id);
    }

    /**
     * Sort results by decreasing similarity
     */
    public Result[] sortTestResults(String testKey) {
        Result[] P = new Result[subs.length * (subs.length - 1) / 2];
        for (int i = 0, k = 0; i < subs.length; i++) {
            float[] F = (float[]) subs[i].getData(testKey);
            for (int j = 0; j < i; j++, k++) {
                P[k] = new Result(subs[i], subs[j], F[j]);
            }
        }
        Arrays.sort(P);
        return P;
    }

    /**
     * Inner class, representing a Result. Good for sorting
     */
    public static class Result implements Comparable {
        private Submission a, b;
        private float d;

        public Result(Submission a, Submission b, float d) {
            this.a = a;
            this.b = b;
            this.d = d;
        }

        public int compareTo(Object o) {
            float f = d - ((Result) o).d;
            return f > 0 ? 1 : (f < 0 ? -1 : 0);
        }

        @Override
        public String toString() {
            return "" + d + " " + a.getId() + " " + b.getId();
        }

        public Submission getA() {
            return a;
        }

        public Submission getB() {
            return b;
        }

        public float getDist() {
            return d;
        }
    }

    /**
     * Saves the analysis
     * @return an element that contains the whole document
     * @throws IOException 
     */
    public Element saveToXML() throws IOException {
        Element root = new Element("analysis");
        root.setAttribute("version", VERSION_STRING);
        root.setAttribute("created", new SimpleDateFormat("dd/MM/yyyy HH:mm").format(new Date()));

        root.addContent(sourceSet.saveToXML());

        Element annotations = new Element("annotations");
        for (Submission s : subs) {
            if (!s.getAnnotations().isEmpty()) {
                annotations.addContent(s.saveToXML());
            }
        }
        root.addContent(annotations);

        Element tests = new Element("tests");
        for (Test t : appliedTests) {
            if (!t.isCancelled()) {
                tests.addContent(t.saveToXML());
            }
        }
        root.addContent(tests);

        return root;
    }

    /**
     * Loads the analysis
     * @param root
     * @throws IOException 
     */
    public void loadFromXML(Element root) throws IOException {
        String version = root.getAttributeValue("version");
        if (!version.equals(VERSION_STRING)) {
            log.warn("Loading from different version (" + version + "); " + " but this program uses save-version "
                    + VERSION_STRING);
        }

        log.info("Loading sources...");
        sourceSet.loadFromXML(root.getChild("sources"));
        loadSources(sourceSet);

        log.info("Loading annotations...");
        for (Element se : root.getChild("annotations").getChildren()) {
            Submission sub = getSubmission(se.getAttributeValue("id"));
            sub.loadFromXML(se);
        }

        log.info("Loading tests...");
        ArrayList<Test> pendingTests = new ArrayList<Test>();
        for (Element te : root.getChild("tests").getChildren()) {
            String tcn = te.getAttributeValue("class");
            try {
                Test t = (Test) getClass().getClassLoader().loadClass(tcn).newInstance();
                t.loadFromXML(te);
                pendingTests.add(t);
            } catch (Exception ex) {
                throw new IOException("Could not load test " + tcn);
            }
        }

        // now, run tests in an order that satisfies dependencies
        while (!pendingTests.isEmpty()) {
            boolean progress = false;
            for (Test candidate : pendingTests) {
                boolean dependenciesMet = false;
                for (String k : candidate.getRequires()) {
                    if (!hasResultsForKey(k)) {
                        log.info("Cannot execute " + candidate.getClass() + " without " + k + ": postponing");
                        dependenciesMet = false;
                        break;
                    }
                }
                if (dependenciesMet) {
                    log.info("Dependencies for " + candidate.getClass() + " satisfied, processing");
                    progress = true;
                    applyTest(candidate);
                    pendingTests.remove(candidate);
                }
            }
            if (!progress) {
                throw new IOException("Impossible to meet dependencies for tests");
            }
        }
    }

    /**
     * Reads an analysis from a file
     *
     * @param f the file to read from
     * @throws IOException on any error (may wrap invalid internal XML errors)
     */
    public void loadFromFile(File f) throws IOException {
        try {
            SAXBuilder builder = new SAXBuilder();
            Document doc = builder.build(f);
            loadFromXML(doc.getRootElement());
        } catch (Exception e) {
            throw new IOException("Error loading from '" + f.getAbsolutePath() + "' xml save file", e);
        }
    }

    /**
     * Saves this analysis to a file
     *
     * @param f the file to write to 
     * @throws java.io.IOException
     */
    public void saveToFile(File f) throws IOException {
        FileOutputStream fos = null;
        try {
            XMLOutputter outputter = new XMLOutputter(Format.getPrettyFormat());
            fos = new FileOutputStream(f);
            outputter.output(new Document(saveToXML()), fos);
        } finally {
            if (fos != null) {
                fos.close();
            }
        }
    }
}