net.lldp.checksims.algorithm.linesimilarity.LineSimilarityChecker.java Source code

Java tutorial

Introduction

Here is the source code for net.lldp.checksims.algorithm.linesimilarity.LineSimilarityChecker.java

Source

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * See LICENSE.txt included in this distribution for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at LICENSE.txt.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * Copyright (c) 2014-2015 Nicholas DeMarinis, Matthew Heon, and Dolan Murvihill
 */

package net.lldp.checksims.algorithm.linesimilarity;

import net.lldp.checksims.algorithm.AlgorithmResults;
import net.lldp.checksims.algorithm.InternalAlgorithmError;
import net.lldp.checksims.algorithm.SimilarityDetector;
import net.lldp.checksims.parse.SubmissionPercentableCalculator;
import net.lldp.checksims.parse.token.PercentableTokenListDecorator;
import net.lldp.checksims.parse.token.SubmissionTokenizer;
import net.lldp.checksims.parse.token.Token;
import net.lldp.checksims.parse.token.TokenList;
import net.lldp.checksims.parse.token.TokenType;
import net.lldp.checksims.parse.token.TokenTypeMismatchException;
import net.lldp.checksims.parse.token.tokenizer.Tokenizer;
import net.lldp.checksims.submission.Submission;

import org.apache.commons.codec.binary.Hex;
import org.apache.commons.lang3.tuple.Pair;

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.*;

import static com.google.common.base.Preconditions.checkNotNull;

/**
 * Implements a line-by-line similarity checker.
 *
 * <p>Each line token of both submissions is hashed with SHA-512. A line whose hash occurs in both
 * submissions is treated as a match, and the corresponding token is marked invalid on each side.
 */
public final class LineSimilarityChecker extends SimilarityDetector<PercentableTokenListDecorator> { // TODO LINE
    // Built during class initialization, so concurrent getInstance() callers can never observe
    // (or create) more than one instance.
    private static final LineSimilarityChecker INSTANCE = new LineSimilarityChecker();

    /**
     * Internal class for record-keeping - used to record a line at a specific location in a submission.
     *
     * <p>Static because it never touches the state of the enclosing checker.
     */
    static final class SubmissionLine {
        /** Index of the line's token within its submission's token list. */
        public final int lineNum;
        /** Submission the line was read from. */
        public final Submission submission;

        SubmissionLine(int lineNum, Submission submission) {
            this.lineNum = lineNum;
            this.submission = submission;
        }

        @Override
        public String toString() {
            return "Line " + lineNum + " from submission with name " + submission.getName();
        }
    }

    private LineSimilarityChecker() {
    }

    /**
     * @return The singleton instance of the line similarity checker
     */
    public static LineSimilarityChecker getInstance() {
        return INSTANCE;
    }

    /**
     * @return Name of this algorithm, {@code "linecompare"}
     */
    @Override
    public String getName() {
        return "linecompare";
    }

    /**
     * @return A new submission tokenizer built around the {@link TokenType#LINE} tokenizer
     */
    @Override
    public SubmissionPercentableCalculator<PercentableTokenListDecorator> getPercentableCalculator() {
        return new SubmissionTokenizer(Tokenizer.getTokenizer(TokenType.LINE));
    }

    /**
     * Detect similarities using line similarity comparator.
     *
     * <p>If {@code a} equals {@code b}, every token on both sides is marked invalid. Otherwise only the
     * tokens of lines whose hash appears in both submissions are marked invalid.
     *
     * @param ab Pair of submissions being compared; the left one owns {@code a}, the right one owns {@code b}
     * @param a Line tokens of the first (left) submission
     * @param b Line tokens of the second (right) submission
     * @return Results of the similarity detection
     * @throws TokenTypeMismatchException Declared by the overridden signature; this method performs no
     *                                    token-type check of its own
     * @throws InternalAlgorithmError Thrown on error obtaining a hash algorithm instance
     */
    @Override
    public AlgorithmResults detectSimilarity(Pair<Submission, Submission> ab, PercentableTokenListDecorator a,
            PercentableTokenListDecorator b) throws TokenTypeMismatchException, InternalAlgorithmError {
        checkNotNull(ab);
        checkNotNull(a);
        checkNotNull(b);

        // Fetch each token list once instead of on every access inside the loops below.
        // NOTE(review): marking tokens invalid only reaches a and b if getDataCopy() exposes the
        // decorator's own Token objects - confirm against PercentableTokenListDecorator.
        TokenList tokensA = a.getDataCopy();
        TokenList tokensB = b.getDataCopy();

        if (a.equals(b)) {
            // Identical token lists: everything matches, no hashing needed
            tokensA.stream().forEach((token) -> token.setValid(false));
            tokensB.stream().forEach((token) -> token.setValid(false));
            return new AlgorithmResults(ab, a, b);
        }

        MessageDigest hasher;

        // Get a hashing instance
        try {
            hasher = MessageDigest.getInstance("SHA-512");
        } catch (NoSuchAlgorithmException e) {
            throw new InternalAlgorithmError("Error instantiating SHA-512 hash algorithm: " + e.getMessage());
        }

        Submission left = ab.getLeft();
        Submission right = ab.getRight();

        // Create a line database map
        // Per-method basis to ensure we have no mutable state in the class
        Map<String, List<SubmissionLine>> lineDatabase = new HashMap<>();

        // Hash all lines in A and B, and put them in the lines database
        addLinesToMap(tokensA, lineDatabase, left, hasher);
        addLinesToMap(tokensB, lineDatabase, right, hasher);

        for (List<SubmissionLine> sameHash : lineDatabase.values()) {
            // A hash seen only once cannot be shared between the submissions
            if (sameHash.size() < 2) {
                continue;
            }

            // Work out which of the two submissions contain this line
            boolean presentInA = false;
            boolean presentInB = false;
            for (SubmissionLine s : sameHash) {
                if (s.submission.equals(left)) {
                    presentInA = true;
                } else if (s.submission.equals(right)) {
                    presentInB = true;
                } else {
                    throw new RuntimeException("Unreachable code!");
                }
            }

            if (!presentInA || !presentInB) {
                // Only one of the submissions includes the line - no plagiarism here
                continue;
            }

            // Set matches invalid. Every entry was attributed to left or right by the loop above.
            for (SubmissionLine s : sameHash) {
                TokenList owner = s.submission.equals(left) ? tokensA : tokensB;
                owner.get(s.lineNum).setValid(false);
            }
        }

        return new AlgorithmResults(ab, a, b);
    }

    /**
     * Hashes every token in {@code lines} and records its position in the line database.
     *
     * @param lines Tokens to hash, one per line
     * @param lineDatabase Map from hex-encoded hash to the lines having that hash; updated in place
     * @param submitter Submission recorded as the origin of each line
     * @param hasher Digest used to hash each line
     */
    void addLinesToMap(TokenList lines, Map<String, List<SubmissionLine>> lineDatabase, Submission submitter,
            MessageDigest hasher) {
        for (int i = 0; i < lines.size(); i++) {
            Token token = lines.get(i);

            // Fixed charset: with the platform default, characters it cannot encode are replaced,
            // so distinct lines could end up with the same bytes and therefore the same hash.
            byte[] lineBytes = token.getTokenAsString().getBytes(StandardCharsets.UTF_8);
            String hash = Hex.encodeHexString(hasher.digest(lineBytes));

            lineDatabase.computeIfAbsent(hash, (unused) -> new ArrayList<>()).add(new SubmissionLine(i, submitter));
        }
    }

    /**
     * @return Hash code of the algorithm name
     */
    @Override
    public int hashCode() {
        return this.getName().hashCode();
    }

    /**
     * @param other Object to compare to
     * @return True if {@code other} is a LineSimilarityChecker
     */
    @Override
    public boolean equals(Object other) {
        return other instanceof LineSimilarityChecker;
    }
}