gg.pistol.sweeper.core.Analyzer.java Source code

Introduction

Here is the source code for gg.pistol.sweeper.core.Analyzer.java
Source

/*
 * Sweeper - Duplicate file cleaner
 * Copyright (C) 2012 Bogdan Ciprian Pistol
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package gg.pistol.sweeper.core;

import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;
import gg.pistol.lumberjack.JackLogger;
import gg.pistol.lumberjack.JackLoggerFactory;
import gg.pistol.sweeper.core.Target.Type;
import gg.pistol.sweeper.core.resource.Resource;
import gg.pistol.sweeper.core.resource.ResourceDirectory;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Deque;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.NavigableSet;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicBoolean;

/**
 * Analyzes a set of targets to find duplicates.
 *
 * <p>The {@link #analyze} and {@link #delete} methods are not thread safe and must be called from the same thread or
 * using synchronization techniques. The {@link #abortAnalysis} and {@link #abortDeletion} methods are thread safe
 * and can be called from any thread.
 *
 * @author Bogdan Pistol
 */
// package private
class Analyzer {

    // Logger used for the trace/info messages of this class; created in the constructor.
    private final JackLogger log;

    // Hash function handed to TargetImpl.computeHash() by the hash visitor.
    private final HashFunction hashFunction;

    // Operation-in-progress markers: written by analyze() and delete(), read by checkAbortFlag() to decide which
    // abort flag is relevant. They are plain (non-atomic, non-volatile) fields.
    private boolean analyzing;
    private boolean deleting;

    // Abort requests: raised by abortAnalysis()/abortDeletion() and reset to false when the corresponding
    // operation starts.
    private final AtomicBoolean abortAnalysis;
    private final AtomicBoolean abortDeletion;

    // Assigned by analyze() after the duplicates were counted; stays null until an analysis gets that far.
    @Nullable
    private SweeperCountImpl count;
    // Assigned by analyze() as soon as the resource traversal returns; null before that.
    @Nullable
    private TargetImpl rootTarget;

    /**
     * Creates an analyzer with both abort flags cleared.
     *
     * @throws SweeperException wrapping the {@link NoSuchAlgorithmException} raised while creating the hash function
     */
    Analyzer() throws SweeperException {
        HashFunction function;
        try {
            function = new HashFunction();
        } catch (NoSuchAlgorithmException e) {
            throw new SweeperException(e);
        }
        hashFunction = function;

        abortAnalysis = new AtomicBoolean();
        abortDeletion = new AtomicBoolean();
        log = JackLoggerFactory.getLogger(LoggerFactory.getLogger(Analyzer.class));
    }

    /**
     * Runs the whole duplicate analysis: resource traversal, size computation, size de-duplication, hash
     * computation, hash de-duplication, counting and grouping.
     *
     * <p>As a side effect the traversed root target and the computed count are stored in this analyzer and can be
     * read afterwards with {@link #getRootTarget} and {@link #getCount}.
     *
     * @param targetResources the resources to analyze, must be neither null nor empty
     * @param listener the listener that receives the operation notifications, must not be null
     * @return the set of all {@link DuplicateGroup}s sorted decreasingly by size.
     * @throws SweeperAbortException if the abort flag raised by {@link #abortAnalysis} is detected
     */
    NavigableSet<DuplicateGroup> analyze(Collection<? extends Resource> targetResources,
            SweeperOperationListener listener) throws SweeperAbortException {
        Preconditions.checkNotNull(targetResources);
        Preconditions.checkNotNull(listener);
        Preconditions.checkArgument(!targetResources.isEmpty());

        log.trace("Computing the analysis for the resources {}.", targetResources);
        analyzing = true;
        deleting = false;
        abortAnalysis.set(false);
        OperationTrackingListener tracker = new OperationTrackingListener(listener);

        // Traversal: builds the target tree and counts every target (ROOT included). The counter is filled in by
        // traverseResources() and later serves as the maximum progress of the sizing step.
        MutableInteger targetCounter = new MutableInteger(0);
        rootTarget = traverseResources(targetResources, targetCounter, tracker);

        // Sizing: only targets sharing their size with another target can be duplicates.
        Collection<TargetImpl> sizedTargets = computeSize(rootTarget, targetCounter.intValue(), tracker);
        Multimap<Long, TargetImpl> sameSize = filterDuplicateSize(sizedTargets);

        // Hashing: restricted to the size duplicates, then narrowed down to the targets sharing a hash.
        Collection<TargetImpl> hashCandidates = sameSize.values();
        computeHash(hashCandidates, tracker);
        Multimap<String, TargetImpl> sameHash = filterDuplicateHash(hashCandidates);

        count = computeCount(rootTarget, sameHash);
        NavigableSet<DuplicateGroup> groups = createDuplicateGroups(sameHash);
        analyzing = false;
        return groups;
    }

    /**
     * Wraps the resources into a root target and expands the whole hierarchy below it.
     *
     * @param targetResources the resources that become the children of the root target
     * @param totalTargets output parameter: receives the number of traversed targets plus one for the root target
     * @param listener the listener notified about the traversal operation
     * @return a root target that wraps the {@code targetResources}
     * @throws SweeperAbortException if an abort request is detected while expanding
     */
    private TargetImpl traverseResources(Collection<? extends Resource> targetResources,
            MutableInteger totalTargets, OperationTrackingListener listener) throws SweeperAbortException {
        log.trace("Traversing the resources.");
        listener.updateOperation(SweeperOperation.RESOURCE_TRAVERSING);

        TargetImpl rootWrapper = new TargetImpl(new LinkedHashSet<Resource>(targetResources));

        // The root itself counts as one target, the expansion adds every target found below it.
        totalTargets.setValue(1);
        totalTargets.add(expand(rootWrapper.getChildren(), listener));

        listener.operationCompleted();
        return rootWrapper;
    }

    /**
     * Expands the given root children and, transitively, all of their non-file descendants.
     *
     * <p>Side effect: whenever an expanded child is equal to one of the {@code rootChildren}, that root child is
     * removed from the {@code rootChildren} collection, so that no target ends up with more than one parent.
     *
     * <p>Such a situation arises when {@link #analyze} receives the resources "res1" and "res2" and res2 happens to
     * be a descendant of res1:
     *
     * <pre><code>
     * root---res1---dir---res2
     *     \
     *      --res2
     * </code></pre>
     *
     * Here res2 would have the parents root and dir, therefore the "root---res2" link is dropped.
     *
     * @param rootChildren the children of the root target, modified in place as described above
     * @param listener the listener passed to every expanded target
     * @return the number of traversed children targets
     * @throws SweeperAbortException if an abort request is detected between two expansions
     */
    private int expand(Collection<TargetImpl> rootChildren, OperationTrackingListener listener)
            throws SweeperAbortException {
        // Copy used for fast lookups; it shrinks together with rootChildren.
        Set<TargetImpl> remainingRootChildren = new HashSet<TargetImpl>(rootChildren);
        Deque<TargetImpl> pending = new LinkedList<TargetImpl>(rootChildren);
        int traversed = 0;

        while (!pending.isEmpty()) {
            TargetImpl current = pending.pop();
            current.expand(listener);
            traversed++;

            for (TargetImpl child : current.getChildren()) {
                if (child.getType() == Type.FILE) {
                    // Files are never pushed on the stack, so they are counted right away.
                    traversed++;
                } else {
                    pending.push(child);
                }

                // Resolve the multiple parent situation: keep the deeper occurrence, drop the root link.
                // NOTE(review): the dropped root child may still be waiting on the stack (or may already have been
                // expanded), in which case it is counted as well - confirm whether the total may over-report.
                if (remainingRootChildren.remove(child)) {
                    rootChildren.remove(child);
                }
            }
            checkAbortFlag();
        }
        return traversed;
    }

    /**
     * Throws if the operation currently marked as running has been asked to abort.
     *
     * @throws SweeperAbortException if an analysis is in progress and its abort flag is raised, or if a deletion is
     *         in progress and its abort flag is raised
     */
    // package private for testing
    void checkAbortFlag() throws SweeperAbortException {
        boolean analysisAborted = analyzing && abortAnalysis.get();
        if (analysisAborted) {
            log.info("Aborting the analysis.");
            throw new SweeperAbortException();
        }

        boolean deletionAborted = deleting && abortDeletion.get();
        if (deletionAborted) {
            log.info("Aborting the deletion.");
            throw new SweeperAbortException();
        }
    }

    /**
     * Computes the size of every target below (and including) {@code root}, children before parents, reporting
     * one progress step per visited target.
     *
     * @param root the top of the hierarchy to size
     * @param totalTargets the maximum progress value announced to the listener
     * @param listener the listener notified about the operation and passed to every sized target
     * @return the collection of all the targets with computed size traversed from the {@code root}
     * @throws SweeperAbortException if an abort request is detected during the traversal
     */
    private Collection<TargetImpl> computeSize(TargetImpl root, int totalTargets,
            final OperationTrackingListener listener) throws SweeperAbortException {
        log.trace("Computing the size for {} that has <{}> total sub-targets.", root, totalTargets);
        listener.updateOperation(SweeperOperation.SIZE_COMPUTATION);
        listener.setOperationMaxProgress(totalTargets);

        final Collection<TargetImpl> sizedTargets = new ArrayList<TargetImpl>();
        TargetVisitorMethod sizeVisitor = new TargetVisitorMethod() {
            public void visit(TargetImpl target, int targetIndex) {
                target.computeSize(listener);
                // Targets that could not be sized are left out of the result.
                if (target.isSized()) {
                    sizedTargets.add(target);
                }
                listener.incrementOperationProgress(targetIndex);
            }
        };
        traverseBottomUp(Collections.singleton(root), sizeVisitor);

        listener.operationCompleted();
        return sizedTargets;
    }

    /*
     * Visits the targets of a tree so that every target is visited after all of its children.
     *
     * For example the tree below is visited in the order: A, B, C, D, E, F, root.
     *
     *          --E
     *         /
     * root---F---D
     *     \
     *      --C---B
     *         \
     *          --A
     *
     * The visitor receives the 1-based position of the target in the visiting order. The abort flag is polled
     * after every step of the traversal.
     */
    private void traverseBottomUp(Collection<TargetImpl> roots, TargetVisitorMethod visitor)
            throws SweeperAbortException {
        Deque<TargetImpl> pending = new LinkedList<TargetImpl>(roots); // DFS style stack
        Set<TargetImpl> opened = new HashSet<TargetImpl>(); // targets whose children are already on the stack
        int visited = 0;

        while (!pending.isEmpty()) {
            TargetImpl top = pending.peek();

            if (opened.add(top)) {
                // First encounter: leave the target on the stack and schedule its children ahead of it.
                for (TargetImpl child : top.getChildren()) {
                    pending.push(child);
                }
            } else {
                // Second encounter: all the children were handled, the target itself can be visited.
                visited++;
                visitor.visit(top, visited);
                pending.pop();
            }
            checkAbortFlag();
        }
    }

    /**
     * Keeps only the targets whose size is shared with at least one other target.
     *
     * <p>The {@link Target.Type#ROOT} target never takes part in the comparison.
     *
     * @param list the sized targets to inspect
     * @return a multimap with sizes as keys and the targets with that same size as values for the key
     * @throws SweeperAbortException if an abort request is detected while filtering
     */
    private Multimap<Long, TargetImpl> filterDuplicateSize(Collection<TargetImpl> list)
            throws SweeperAbortException {
        log.trace("Deduplicating the size.");

        Function<TargetImpl, Long> sizeOf = new Function<TargetImpl, Long>() {
            @Nullable
            public Long apply(TargetImpl input) {
                // A null key makes filterDuplicates() skip the target.
                if (input.getType() == Target.Type.ROOT) {
                    return null;
                }
                return input.getSize();
            }
        };
        return filterDuplicates(list, sizeOf);
    }

    /**
     * Groups the targets by the value of a criteria function and keeps only the groups that hold duplicates.
     * Two targets are duplicates (with respect to the criteria) when the function returns equal values for them.
     *
     * <p>Targets for which the function returns {@code null} are skipped, and so are targets that are the only
     * child of their parent (the parent stands for the child's content in that case).
     *
     * @param targets the targets to group
     * @param indexFunction the criteria function, may return {@code null} to exclude a target
     * @return a multimap with function values as keys and the targets that are considered duplicates as values for
     *         the key
     * @throws SweeperAbortException if an abort request is detected while grouping or filtering
     */
    private <T> Multimap<T, TargetImpl> filterDuplicates(Collection<TargetImpl> targets,
            Function<TargetImpl, T> indexFunction) throws SweeperAbortException {

        // Step 1: index every target by its key. Done by hand (instead of Multimaps.index()) because null keys
        // have to be skipped and the abort flag has to be polled.
        Multimap<T, TargetImpl> byKey = ArrayListMultimap.create();
        for (TargetImpl target : targets) {
            T key = indexFunction.apply(target);
            if (key != null) {
                byKey.put(key, target);
            }
            checkAbortFlag();
        }

        // Step 2: keep the keys that still have several targets after the only-child targets are dropped. Done by
        // hand (instead of Multimaps.filterKeys()/filterValues()) because the abort flag has to be polled.
        Multimap<T, TargetImpl> duplicates = ArrayListMultimap.create();
        for (T key : byKey.keySet()) {
            checkAbortFlag();
            Collection<TargetImpl> sameKey = byKey.get(key);

            if (sameKey.size() != 1) {
                Collection<TargetImpl> kept = new ArrayList<TargetImpl>();
                for (TargetImpl candidate : sameKey) {
                    TargetImpl parent = candidate.getParent();
                    boolean onlyChild = parent != null && parent.getChildren().size() <= 1;
                    if (!onlyChild) {
                        kept.add(candidate);
                    }
                }

                if (kept.size() > 1) {
                    duplicates.putAll(key, kept);
                }
            }
        }
        return duplicates;
    }

    /**
     * Computes the hash of the specified targets and of everything below them.
     *
     * @param targets the targets to hash; targets having an ancestor in the same collection are reached through
     *        that ancestor
     * @param listener the listener notified about the operation and its progress
     * @throws SweeperAbortException if an abort request is detected
     */
    private void computeHash(Collection<TargetImpl> targets, final OperationTrackingListener listener)
            throws SweeperAbortException {
        log.trace("Computing the hash for {} targets.", targets.size());
        listener.updateOperation(SweeperOperation.HASH_COMPUTATION);

        // Start only from the targets without an ancestor in the collection: the bottom-up traversal hashes all
        // the descendants on the way to their topmost ancestor.
        Collection<TargetImpl> upperTargets = filterUpperTargets(targets);

        // The maximum progress is the summed size of the traversal starting points.
        long sizeToHash = 0;
        for (TargetImpl upper : upperTargets) {
            sizeToHash += upper.getSize();
            checkAbortFlag();
        }
        listener.setOperationMaxProgress(sizeToHash);

        TargetVisitorMethod hashVisitor = getHashVisitorMethod(listener);
        traverseBottomUp(upperTargets, hashVisitor);
        listener.operationCompleted();
    }

    /**
     * Selects the targets that have no ancestor (parent, grandparent, ...) inside the same collection.
     *
     * @param targets the targets to filter
     * @return the collection of filtered targets
     * @throws SweeperAbortException if an abort request is detected
     */
    private Collection<TargetImpl> filterUpperTargets(Collection<TargetImpl> targets) throws SweeperAbortException {
        // Lookup set of all the candidates, filled one by one so that the abort flag can be polled.
        Set<TargetImpl> candidates = new HashSet<TargetImpl>();
        for (TargetImpl target : targets) {
            candidates.add(target);
            checkAbortFlag();
        }

        Collection<TargetImpl> upperTargets = new ArrayList<TargetImpl>();
        for (TargetImpl target : targets) {
            // Climb until an ancestor that is itself a candidate is found or the top is passed.
            TargetImpl ancestor = target.getParent();
            while (ancestor != null && !candidates.contains(ancestor)) {
                ancestor = ancestor.getParent();
            }

            // Running off the top means that no ancestor is a candidate.
            if (ancestor == null) {
                upperTargets.add(target);
            }

            checkAbortFlag();
        }

        return upperTargets;
    }

    /**
     * Creates the visitor that hashes each visited target and reports the accumulated size of the hashed files
     * as progress.
     *
     * @param listener the listener passed to the hashing and notified about the progress
     * @return a stateful visitor (it accumulates the hashed size), to be used for a single traversal
     */
    private TargetVisitorMethod getHashVisitorMethod(final OperationTrackingListener listener) {
        return new TargetVisitorMethod() {
            long hashedFileSize = 0;

            public void visit(TargetImpl target, int targetIndex) throws SweeperAbortException {
                target.computeHash(hashFunction, listener, abortAnalysis);

                // Only files advance the progress: a directory merely re-hashes the hashes of its children, which
                // should be fast compared to the I/O and hashing of potentially very large files.
                if (target.getType() != Type.FILE) {
                    return;
                }
                hashedFileSize += target.getSize();
                listener.incrementOperationProgress(hashedFileSize);
            }
        };
    }

    /**
     * Keeps only the targets whose hash is shared with at least one other target. Targets without a computed hash
     * are left out.
     *
     * @param targets the targets to inspect
     * @return a multimap with hashes as keys and duplicate targets as values for the key
     * @throws SweeperAbortException if an abort request is detected while filtering
     */
    private Multimap<String, TargetImpl> filterDuplicateHash(Collection<TargetImpl> targets)
            throws SweeperAbortException {
        log.trace("Deduplicating the hash for {} targets.", targets.size());

        Function<TargetImpl, String> hashOf = new Function<TargetImpl, String>() {
            @Nullable
            public String apply(TargetImpl input) {
                // A null key makes filterDuplicates() skip the target.
                if (!input.isHashed()) {
                    return null;
                }
                return input.getHash();
            }
        };
        return filterDuplicates(targets, hashOf);
    }

    /**
     * Computes the totals of the analyzed hierarchy together with the totals of what counts as duplicated.
     *
     * <p>In every group of duplicates the first target is regarded as the one to keep, only the remaining targets
     * contribute to the duplicate totals.
     *
     * @param root the root target providing the overall totals
     * @param hashDups the duplicate targets grouped by hash
     * @return the computed count
     * @throws SweeperAbortException if an abort request is detected
     */
    private SweeperCountImpl computeCount(TargetImpl root, Multimap<String, TargetImpl> hashDups)
            throws SweeperAbortException {
        log.trace("Counting {} hash duplicates.", hashDups.size());

        int totalTargets = root.getTotalTargets();
        int totalTargetFiles = root.getTotalTargetFiles();
        long totalSize = root.getSize();

        // hashDups may contain targets that are descendants of other duplicate targets. Counting both would add
        // the same content twice, so only the upper targets are kept and then grouped by hash once more.
        Collection<TargetImpl> upperDuplicates = filterUpperTargets(hashDups.values());
        Multimap<String, TargetImpl> groups = filterDuplicateHash(upperDuplicates);

        int duplicateTargets = 0;
        int duplicateTargetFiles = 0;
        long duplicateSize = 0;

        for (String hash : groups.keySet()) {
            // The first member of a group stops being a duplicate once all the others are deleted, skip it.
            boolean first = true;
            for (TargetImpl member : groups.get(hash)) {
                if (first) {
                    first = false;
                    continue;
                }
                duplicateTargets += member.getTotalTargets();
                duplicateTargetFiles += member.getTotalTargetFiles();
                duplicateSize += member.getSize();

                checkAbortFlag();
            }
        }

        return new SweeperCountImpl(totalTargets, totalTargetFiles, totalSize, duplicateTargets,
                duplicateTargetFiles, duplicateSize);
    }

    /**
     * Builds one {@link DuplicateGroup} for every hash of the multimap.
     *
     * @param hashDups the duplicate targets grouped by hash
     * @return the groups collected into a {@link TreeSet}, ordered by the natural ordering of {@link DuplicateGroup}
     */
    private NavigableSet<DuplicateGroup> createDuplicateGroups(Multimap<String, TargetImpl> hashDups) {
        log.trace("Duplicate grouping.");

        NavigableSet<DuplicateGroup> groups = new TreeSet<DuplicateGroup>();
        for (String hash : hashDups.keySet()) {
            groups.add(new DuplicateGroup(hashDups.get(hash)));
        }
        return groups;
    }

    /**
     * Deletes the specified targets.
     *
     * <p>Duplicated entries are deleted once, a {@link Type#ROOT} target stands for its children, and a target that
     * has an ancestor among the targets is deleted together with that ancestor. Directories whose resource can only
     * be deleted when empty are deleted recursively (children first), everything else is deleted in a single step.
     *
     * <p>The abort flag is polled after every individual deletion, so {@link #abortDeletion} takes effect between
     * two deletions in both the recursive and the single-step case.
     *
     * @param targets the targets to delete, must be neither null nor empty; the elements are cast to
     *        {@code TargetImpl}
     * @param listener the listener that receives the operation notifications, must not be null
     * @throws SweeperAbortException if the abort flag raised by {@link #abortDeletion} is detected
     */
    void delete(Collection<? extends Target> targets, SweeperOperationListener listener)
            throws SweeperAbortException {
        Preconditions.checkNotNull(targets);
        Preconditions.checkNotNull(listener);
        Preconditions.checkArgument(!targets.isEmpty());

        log.trace("Deleting targets.");
        analyzing = false;
        deleting = true;
        abortDeletion.set(false);
        OperationTrackingListener trackingListener = new OperationTrackingListener(listener);
        trackingListener.updateOperation(SweeperOperation.RESOURCE_DELETION);

        // Remove the possible multiple instances of the same target and get the children of any ROOT target (deleting
        // a ROOT target means to delete its children).
        Set<TargetImpl> targetSet = new LinkedHashSet<TargetImpl>();
        for (Target target : targets) {
            if (target.getType() == Type.ROOT) {
                targetSet.addAll(((TargetImpl) target).getChildren());
            } else {
                targetSet.add((TargetImpl) target);
            }
            checkAbortFlag();
        }

        // Use only the upper targets (deleting an upper target will also delete all of its descendants).
        Collection<TargetImpl> upperTargets = filterUpperTargets(targetSet);

        int totalProgress = 0; // total individual targets to delete
        for (TargetImpl target : upperTargets) {
            // In case of recursive deletion then a more granular progress tracking is possible, otherwise if
            // a directory (with all of its contents) can be deleted in one step then the progress will be more coarse.
            if (isDeletedRecursively(target)) {
                totalProgress += target.getTotalTargets();
            } else {
                totalProgress++;
            }
            checkAbortFlag();
        }
        trackingListener.setOperationMaxProgress(totalProgress);
        MutableInteger progress = new MutableInteger(0);

        // The visitor pattern is used for recursive deletion (bottom-up).
        TargetVisitorMethod deleteMethod = getDeleteVisitorMethod(progress, trackingListener);

        for (TargetImpl target : upperTargets) {
            if (isDeletedRecursively(target)) {
                // traverseBottomUp() polls the abort flag after every visited target.
                traverseBottomUp(Collections.singleton(target), deleteMethod);
            } else {
                // Deletion of a file or a directory that can be deleted in one single step.
                target.delete(trackingListener);
                progress.increment();
                trackingListener.incrementOperationProgress(progress.intValue());

                // Without this check an abort request would be ignored for as long as only single-step targets
                // are being deleted.
                checkAbortFlag();
            }
        }

        trackingListener.operationCompleted();
        deleting = false;
    }

    /**
     * Tells whether a target has to be deleted recursively, that is whether it is a directory whose resource can
     * be deleted only when it is empty.
     */
    private static boolean isDeletedRecursively(TargetImpl target) {
        return target.getType() == Type.DIRECTORY
                && ((ResourceDirectory) target.getResource()).deleteOnlyEmpty();
    }

    /**
     * Creates the visitor used for the recursive deletion: it deletes the visited target and reports one more
     * deleted target to the listener.
     *
     * @param progress the counter of deleted targets, incremented on every visit
     * @param listener the listener passed to the deletion and notified about the progress
     * @return the deleting visitor
     */
    private TargetVisitorMethod getDeleteVisitorMethod(final MutableInteger progress,
            final OperationTrackingListener listener) {
        TargetVisitorMethod deleteVisitor = new TargetVisitorMethod() {
            public void visit(TargetImpl target, int targetIndex) {
                // The traversal index is not used: the progress is taken from the counter provided by the caller.
                target.delete(listener);
                progress.increment();
                listener.incrementOperationProgress(progress.intValue());
            }
        };
        return deleteVisitor;
    }

    /**
     * Abort the analyze operation.
     *
     * <p>This only raises the analysis abort flag: a running {@link #analyze} fails with a
     * {@link SweeperAbortException} the next time it polls the flag. {@link #analyze} resets the flag when it
     * starts, so a request made before the analysis begins is discarded.
     *
     * <p>This method is thread safe.
     */
    void abortAnalysis() {
        log.trace("Turning on the analysis abort flag.");
        abortAnalysis.set(true);
    }

    /**
     * Abort the delete operation.
     *
     * <p>This only raises the deletion abort flag: a running {@link #delete} fails with a
     * {@link SweeperAbortException} the next time it polls the flag. {@link #delete} resets the flag when it
     * starts, so a request made before the deletion begins is discarded.
     *
     * <p>This method is thread safe.
     */
    void abortDeletion() {
        log.trace("Turning on the deletion abort flag.");
        abortDeletion.set(true);
    }

    /**
     * Returns the count stored by {@link #analyze}.
     *
     * @return the count assigned by the most recent {@link #analyze} call that reached the counting step, or
     *         {@code null} if no analysis has reached that step yet
     */
    @Nullable
    SweeperCountImpl getCount() {
        return count;
    }

    /**
     * Returns the root target stored by {@link #analyze}.
     *
     * @return the root target assigned by the most recent {@link #analyze} call that completed the resource
     *         traversal, or {@code null} if no traversal has completed yet
     */
    @Nullable
    TargetImpl getRootTarget() {
        return rootTarget;
    }

    /**
     * Visitor pattern interface for hierarchies of targets.
     */
    private static interface TargetVisitorMethod {
        void visit(TargetImpl target, int targetIndex) throws SweeperAbortException;
    }

}