at.ac.tuwien.inso.subcat.miner.SvnMiner.java Source code

Java tutorial

Introduction

Here is the source code for at.ac.tuwien.inso.subcat.miner.SvnMiner.java

Source

/* SvnMiner.java
 *
 * Copyright (C) 2014 Florian Brosch
 * Copyright (C) 2014 Andreas Mauczka
 *
 * Based on work from Andreas Mauczka
 *
 * This program is developed as part of the research project
 * "Lexical Repository Analysis" which is part of the PhD thesis
 * "Design and evaluation for identification, mapping and profiling
 * of medium sized software chunks" by Andreas Mauczka at
 * INSO - University of Technology Vienna. For questions in regard
 * to the research project contact andreas.mauczka(at)inso.tuwien.ac.at
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License 2.0
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 *
 * Author:
 *       Andreas Mauczka <andreas.mauczka(at)inso.tuwien.ac.at>
 */

package at.ac.tuwien.inso.subcat.miner;

import java.io.IOException;
import java.sql.SQLException;
import java.text.ParseException;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.TimeZone;

import at.ac.tuwien.inso.subcat.utility.Reporter;
import at.ac.tuwien.inso.subcat.utility.XmlReader;
import at.ac.tuwien.inso.subcat.utility.XmlReaderException;
import at.ac.tuwien.inso.subcat.utility.Lemmatizer;
import at.ac.tuwien.inso.subcat.model.Commit;
import at.ac.tuwien.inso.subcat.model.Identity;
import at.ac.tuwien.inso.subcat.model.ManagedFile;
import at.ac.tuwien.inso.subcat.model.Model;
import at.ac.tuwien.inso.subcat.model.ModelPool;
import at.ac.tuwien.inso.subcat.model.Project;
import at.ac.tuwien.inso.subcat.model.User;

import org.apache.commons.lang3.StringUtils;
import org.eclipse.jgit.diff.DiffEntry;
import org.eclipse.jgit.diff.DiffFormatter;
import org.eclipse.jgit.diff.RawTextComparator;
import org.eclipse.jgit.diff.DiffEntry.ChangeType;
import org.eclipse.jgit.lib.ObjectId;
import org.eclipse.jgit.lib.Repository;
import org.eclipse.jgit.revwalk.RevCommit;
import org.eclipse.jgit.revwalk.RevWalk;
import org.eclipse.jgit.treewalk.CanonicalTreeParser;
import org.eclipse.jgit.treewalk.EmptyTreeIterator;

public class SvnMiner extends Miner {

    /** Sink for warnings about log entries that cannot be applied to the file cache. */
    private Reporter reporter;
    /** Miner settings; {@code srcLocalPath} is handed to the {@link XmlReader}. */
    private Settings settings;
    /** Project all users, commits and files are registered for. */
    private Project project;
    /** Model obtained from {@link #pool} when the miner is run. */
    private Model model;
    /** Pool the model is fetched from. */
    private ModelPool pool;
    /** Reader over the SVN XML log; created in {@code parse()}. */
    private XmlReader reader;
    /** Lemmatizer applied to every commit message; created once in the constructor. */
    private Lemmatizer lemmatizer;
    /**
     * When {@code true}, processing of the remaining paths of a commit is aborted.
     * NOTE(review): if stop requests arrive from another thread this flag
     * probably needs to be volatile - confirm against the Miner contract.
     */
    private boolean stopped;
    /** Author name used for log entries that carry no {@code <author>} element. */
    private static final String ANON_USER = "anon";
    /** Maps a repository path to the managed file currently known under that path. */
    private Map<String, ManagedFile> fileCache;

    /**
     * Creates a miner for one project.
     *
     * @param settings miner settings, must not be null
     * @param project  project the mined data is stored for, must not be null
     * @param pool     pool the model is taken from, must not be null
     * @param reporter receiver of warnings raised while mining
     */
    public SvnMiner(Settings settings, Project project, ModelPool pool, Reporter reporter) {
        assert (settings != null && project != null && pool != null);

        this.settings = settings;
        this.project = project;
        this.pool = pool;
        this.reporter = reporter;

        this.fileCache = new HashMap<String, ManagedFile>();

        // The lemmatizer appears to be memory intensive, so it is built exactly once.
        this.lemmatizer = new Lemmatizer();
    }

    //
    // Helper:
    //

    private HashMap<String, Identity> identities = new HashMap<String, Identity>();

    /**
     * Returns the identity registered for an SVN user name, creating the user
     * and the identity on first sight.
     *
     * The SVN log carries neither mail addresses nor separate committers, so
     * the user name doubles as mail address, display name and cache key.
     *
     * @param aUser user name taken from the log entry
     * @return the cached or newly created identity
     * @throws SQLException if the model fails to store the user or identity
     */
    private Identity resolveIdentity(String aUser) throws SQLException {
        assert (aUser != null);

        Identity known = identities.get(aUser);
        if (known != null) {
            return known;
        }

        User owner = model.addUser(project, aUser);
        Identity created = model.addIdentity(null, Model.CONTEXT_SRC, aUser, aUser, owner);
        identities.put(aUser, created);
        return created;
    }

    private class FileStats {
        public String oldPath;
        public ChangeType type;
        public boolean isFile;

        public FileStats(String oldPath, ChangeType type, boolean isFile) {
            this.oldPath = oldPath;
            this.type = type;
            this.isFile = isFile;
        }
    }

    /**
     * Kind of change recorded for a path of a log entry.
     *
     * RENAME is used for added or replaced paths whose copy source differs
     * from the path itself; it may therefore also stand for a plain copy.
     * REPLACE is declared but not assigned anywhere in the visible parser code.
     */
    private enum ChangeType {
        ADD, MODIFY, DELETE, RENAME, REPLACE
    }

    /**
     * Stores one SVN revision as a commit and applies its path changes to the
     * model and the file cache.
     *
     * @param aAuthor   author name of the revision; also used as committer
     * @param aDate     commit date
     * @param pathslist changed paths in log order, keyed by path; must not be null
     * @param aMsg      raw commit message, lemmatized before it is stored
     * @param revision  revision identifier of the log entry
     * @throws SQLException if the model fails to store any of the data
     * @throws IOException  declared for callers; not raised by the visible code
     */
    private void processCommit(String aAuthor, Date aDate, LinkedHashMap<String, FileStats> pathslist, String aMsg,
            String revision) throws SQLException, IOException {
        //TODO: Errorhandling to indicate that the SVN Log is not complete, e.g. Revisions for files are missing to have a complete file history

        // Checked before the first dereference instead of after it.
        assert (pathslist != null);

        // The SVN log has no separate committer, so the author identity is reused.
        Identity author = resolveIdentity(aAuthor);
        Identity committer = author;

        // Lemmatize input - delivers String List, convert it to string
        String msg = StringUtils.join(lemmatizer.lemmatize(aMsg), ", ");

        int filecount = pathslist.size();

        Commit commit = model.addCommit(revision, project, author, committer, aDate, msg, filecount, 0, 0);

        // Files already attached to this commit, keyed by path.
        Map<String, ManagedFile> commitFileCache = new HashMap<String, ManagedFile>();

        for (Map.Entry<String, FileStats> item : pathslist.entrySet()) {
            if (stopped) {
                break;
            }

            FileStats stats = item.getValue();
            String path = item.getKey();

            // The parser leaves the type unset for unknown action letters;
            // switching on null would throw a NullPointerException.
            if (stats.type == null) {
                reporter.warning(this.getName(),
                        "unknown change type for: " + path + " in Revision " + revision + " @ SvnMiner");
                continue;
            }

            switch (stats.type) {
            case ADD:
                ManagedFile addedFile = model.addManagedFile(project, path);
                model.addFileChange(commit, addedFile, 0, 0, 0, 0, 0);
                fileCache.put(path, addedFile);
                commitFileCache.put(path, addedFile);
                break;

            case DELETE:
                //TODO: Evaluate whether a file may be added and deleted in the same commit and add commitfilecache logic
                //TODO: Identify reason, why SVN logs sometimes contain modifies and deletes to files that do not exist and are not renamed either
                if (!stats.isFile) {
                    removeBulk(path, commit);
                } else {
                    ManagedFile deletedFile = fileCache.get(path);
                    if (deletedFile != null) {
                        model.addFileDeletion(deletedFile, commit);
                        // Evict the entry under the same key it was looked up with.
                        fileCache.remove(path);
                    } else {
                        reporter.warning(this.getName(),
                                "could not find: " + path + " to delete in Revision " + revision + " @ SvnMiner");
                    }
                }
                break;

            case MODIFY:
                // Modifications of directories are ignored.
                if (stats.isFile) {
                    //TODO: Identify reason, why SVN logs sometimes contain modifies and deletes to files that do not exist and are not renamed either
                    ManagedFile modifiedFile = fileCache.get(path);
                    if (modifiedFile != null) {
                        // A file is attached to a commit at most once.
                        if (!commitFileCache.containsKey(path)) {
                            model.addFileChange(commit, modifiedFile, 0, 0, 0, 0, 0);
                            commitFileCache.put(path, modifiedFile);
                        }
                    } else {
                        reporter.warning(this.getName(),
                                "could not find: " + path + " to modify in Revision " + revision + " @ SvnMiner");
                    }
                }
                break;

            case RENAME:
                //TODO: SVN ADD and DELETE in the same commit is used as rename in SVN - this differentiation has not been implemented yet
                if (!stats.isFile) {
                    renameBulk(stats.oldPath, path, commit, commitFileCache);
                } else {
                    ManagedFile copyFile = model.addManagedFile(project, path);
                    model.addFileChange(commit, copyFile, 0, 0, 0, 0, 0);
                    fileCache.put(path, copyFile);
                    commitFileCache.put(path, copyFile);
                    // The old path stays cached: RENAME may also stand for a copy.
                }
                break;

            default:
                assert (false);
            }
        }

        emitTasksProcessed(1);
    }

    /**
     * Registers every cached file below directory {@code oldPath} under the
     * corresponding path below {@code newPath} and attaches it to the commit.
     *
     * Matching is done on whole path segments, so renaming {@code /trunk/foo}
     * does not touch files below {@code /trunk/foobar}. The old entries stay
     * in the file cache because the operation may also be a copy.
     *
     * @param oldPath directory the files are copied or moved from
     * @param newPath directory the files are copied or moved to
     * @param commit  commit the new files are attached to
     * @param result  receives the newly created files keyed by their new path;
     *                all of its entries are merged into the file cache afterwards
     * @throws SQLException if the model fails to store a file or file change
     */
    public void renameBulk(String oldPath, String newPath, Commit commit, Map<String, ManagedFile> result)
            throws SQLException {

        String oldPrefix = oldPath.endsWith("/") ? oldPath : oldPath + "/";
        String newPrefix = newPath.endsWith("/") ? newPath : newPath + "/";

        // New entries are collected in result first; the file cache must not
        // be modified while it is being iterated.
        for (String key : fileCache.keySet()) {
            if (key == null || !key.startsWith(oldPrefix)) {
                continue;
            }

            String relativeName = key.substring(oldPrefix.length());
            if (relativeName.length() == 0) {
                continue;
            }

            String newFileName = newPrefix + relativeName;
            ManagedFile addedFile = model.addManagedFile(project, newFileName);

            model.addFileChange(commit, addedFile, 0, 0, 0, 0, 0);
            result.put(newFileName, addedFile);
        }

        fileCache.putAll(result);
    }

    /**
     * Marks every cached file below directory {@code path} as deleted in the
     * given commit and evicts it from the file cache.
     *
     * Files in nested subdirectories are included, since deleting a directory
     * in SVN removes its whole subtree while the log lists only the directory.
     *
     * @param path   directory that was deleted
     * @param commit commit the deletions are attached to
     * @throws SQLException if the model fails to store a deletion
     */
    public void removeBulk(String path, Commit commit) throws SQLException {

        String prefix = path.endsWith("/") ? path : path + "/";

        // Keys are collected first; the file cache must not be modified while
        // it is being iterated.
        List<String> removedKeys = new ArrayList<String>();
        for (Map.Entry<String, ManagedFile> entry : fileCache.entrySet()) {
            String key = entry.getKey();
            if (key == null || !key.startsWith(prefix)) {
                continue;
            }

            ManagedFile deletedFile = entry.getValue();
            assert (deletedFile != null);
            model.addFileDeletion(deletedFile, commit);

            // Remember the file's own key, not the directory path, so the
            // entry really leaves the cache below.
            removedKeys.add(key);
        }

        for (String key : removedKeys) {
            fileCache.remove(key);
        }
    }

    /**
     * Reads the SVN XML log from {@code settings.srcLocalPath} and processes
     * each {@code <logentry>} element in turn.
     *
     * Parsing ends early once the stop flag is set, so that no further
     * revisions are stored after a stop request.
     *
     * @throws XmlReaderException if the log does not have the expected structure
     * @throws ParseException     if a revision date cannot be parsed
     * @throws IOException        propagated from processing a log entry
     * @throws SQLException       if the model fails to store mined data
     */
    public void parse() throws XmlReaderException, ParseException, IOException, SQLException {
        reader = new XmlReader(settings.srcLocalPath);
        reader.expectStart("log", true);
        reader.acceptStart("log", true);

        while (!stopped && reader.isStart("logentry")) {
            parseLogentry();
        }

        // After an early stop the reader may still sit on a <logentry>, so
        // the closing tag is only required when the log was read to the end.
        if (!stopped) {
            reader.expectEnd("log", true);
        }
    }

    /**
     * Parses one {@code <logentry>} element (author, date, paths, msg) and
     * stores it as a commit.
     *
     * @throws XmlReaderException if the entry does not have the expected structure
     * @throws ParseException     if the date text is not a valid SVN timestamp
     * @throws IOException        propagated from commit processing
     * @throws SQLException       if the model fails to store the commit
     */
    private void parseLogentry() throws XmlReaderException, ParseException, IOException, SQLException {

        // Attributes have to be read before the element is accepted.
        String revision = reader.getAttribute("revision");

        reader.acceptStart("logentry", true);

        //<author> might be omitted, e.g. during cvs2svn migration - we map this to anon user
        String author = parseAuthor("author");

        Date date = parseSvnDate(parseSingle("date"));

        //<paths> must have at least one element <path> but may have more
        LinkedHashMap<String, FileStats> pathslist = new LinkedHashMap<String, FileStats>();
        parseMulti(new String[] { "paths", "path" }, pathslist);

        String msg = parseSingle("msg");

        //store the commit in the DB
        processCommit(author, date, pathslist, msg, revision);

        reader.expectEnd("logentry", true);
    }

    /**
     * Parses an SVN log timestamp of the form
     * {@code yyyy-MM-dd'T'HH:mm:ss[.fraction]Z} as a UTC instant.
     *
     * The fraction is handled by hand: SimpleDateFormat's {@code S} field
     * means milliseconds, so letting it read a six-digit microsecond fraction
     * would add the whole number as milliseconds. Digits beyond millisecond
     * precision are dropped. The trailing {@code Z} denotes UTC, therefore the
     * formatter is pinned to UTC instead of the JVM default time zone.
     *
     * @param text timestamp text from the {@code <date>} element
     * @return the parsed instant with millisecond precision
     * @throws ParseException if the text does not match the expected format
     */
    private static Date parseSvnDate(String text) throws ParseException {
        if (text == null) {
            throw new ParseException("Missing date", 0);
        }

        SimpleDateFormat secondsFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");
        secondsFormat.setTimeZone(TimeZone.getTimeZone("UTC"));

        ParsePosition position = new ParsePosition(0);
        Date wholeSeconds = secondsFormat.parse(text, position);
        if (wholeSeconds == null) {
            throw new ParseException("Unparseable date: \"" + text + "\"", position.getErrorIndex());
        }

        int index = position.getIndex();
        long millis = 0;
        if (index < text.length() && text.charAt(index) == '.') {
            index++;
            int digits = 0;
            while (index < text.length() && text.charAt(index) >= '0' && text.charAt(index) <= '9') {
                if (digits < 3) {
                    millis = millis * 10 + (text.charAt(index) - '0');
                }
                digits++;
                index++;
            }
            // Scale short fractions such as ".5" up to milliseconds.
            for (int i = digits; i < 3; i++) {
                millis *= 10;
            }
        }

        if (index >= text.length() || text.charAt(index) != 'Z') {
            throw new ParseException("Unparseable date: \"" + text + "\"", index);
        }

        return new Date(wholeSeconds.getTime() + millis);
    }

    /**
     * Reads the text of a simple element: accepts its start tag, takes the
     * text and requires the matching end tag.
     *
     * @param element name of the element to read
     * @return the text returned by the reader
     * @throws XmlReaderException if the reader rejects the element structure
     */
    private String parseSingle(String element) throws XmlReaderException {
        reader.acceptStart(element, true);
        final String text = reader.getText();
        reader.expectEnd(element, true);

        return text;
    }

    private class PathEntry {
        public String newPath;
        public ChangeType type;
        public String oldPath;
        public String copyFromRev;
        public boolean isFile;
    }

    /**
     * Parses one {@code <path>} element into a {@link PathEntry}.
     *
     * The action attribute is mapped by its first letter: 'M' to MODIFY, 'D'
     * to DELETE, and 'A' or 'R' to RENAME when a differing copy source is
     * present, otherwise to ADD. For any other action the type stays null.
     *
     * @param element name of the path element
     * @return the populated entry
     * @throws XmlReaderException if the reader rejects the element structure
     */
    private PathEntry parsePath(String element) throws XmlReaderException {

        // All attributes are fetched up front: reading the text moves the
        // reader on to the next tag.
        final boolean copied = reader.hasAttribute("copyfrom-path");
        final String action = reader.getAttribute("action");
        final boolean fileKind = reader.getAttribute("kind").equals("file");

        // TODO Check if we really only care about this, if it is a file
        // The copyfrom-rev attribute is not needed yet and therefore not read.
        final String copySource = copied ? reader.getAttribute("copyfrom-path") : null;

        reader.acceptStart(element, true);
        final String target = reader.getText();
        reader.acceptEnd(element, true);

        PathEntry entry = new PathEntry();
        entry.isFile = fileKind;
        entry.newPath = target;
        // Without a copy source the old path is the path itself.
        entry.oldPath = copied ? copySource : target;

        if (!action.isEmpty()) {
            switch (action.charAt(0)) {
            case 'A':
            case 'R':
                //TODO: evaluate all replace actions
                // A replace with a differing copy source is assumed to be a
                // bulk rename, just like an add with a differing copy source.
                if (entry.oldPath.equals(entry.newPath)) {
                    entry.type = ChangeType.ADD;
                } else {
                    entry.type = ChangeType.RENAME;
                }
                break;
            case 'M':
                entry.type = ChangeType.MODIFY;
                break;
            case 'D':
                entry.type = ChangeType.DELETE;
                break;
            default:
                break;
            }
        }

        return entry;
    }

    /**
     * Reads the optional author element.
     *
     * @param element name of the author element
     * @return the element text, or {@code ANON_USER} when the element is absent
     * @throws XmlReaderException if the reader rejects the element structure
     */
    private String parseAuthor(String element) throws XmlReaderException {
        final boolean present = reader.acceptStart(element, true);

        if (present) {
            final String name = reader.getText();
            reader.expectEnd(element, true);
            return name;
        }

        return ANON_USER;
    }

    /**
     * Parses a container element and all of its child path elements.
     *
     * @param elements  {@code elements[0]} names the container, {@code elements[1]} the child
     * @param pathslist receives one {@link FileStats} per child, keyed by its path
     * @return true if at least one child is marked as a file
     * @throws XmlReaderException if the reader rejects the element structure
     */
    private boolean parseMulti(String[] elements, LinkedHashMap<String, FileStats> pathslist)
            throws XmlReaderException {
        final String container = elements[0];
        reader.acceptStart(container, true);

        final String child = elements[1];
        boolean sawFile = false;

        while (reader.isStart(child)) {
            PathEntry entry = parsePath(child);
            sawFile = sawFile || entry.isFile;

            // TODO Use more of SVN Attributes?
            FileStats stats = new FileStats(entry.oldPath, entry.type, entry.isFile);
            pathslist.put(entry.newPath, stats);
        }

        reader.acceptEnd(container, true);
        return sawFile;
    }

    /**
     * Runs the miner: fetches the model from the pool, registers the
     * FLAG_SRC_INFO and FLAG_SRC_FILE_STATS flags for the project and parses
     * the log. Start and end are signalled via emitStart and emitEnd.
     *
     * @throws MinerException wrapping any I/O, SQL, date or XML failure
     */
    @Override
    public void run() throws MinerException {
        try {
            model = pool.getModel();
            emitStart();

            model.addFlag(project, Model.FLAG_SRC_INFO);
            model.addFlag(project, Model.FLAG_SRC_FILE_STATS);

            _run();
            emitEnd();
        } catch (XmlReaderException xmlFailure) {
            throw new MinerException("XML Reader-Error: " + xmlFailure.getMessage(), xmlFailure);
        } catch (ParseException dateFailure) {
            throw new MinerException("Date Parsing-Error: " + dateFailure.getMessage(), dateFailure);
        } catch (SQLException sqlFailure) {
            throw new MinerException("SQL-Error: " + sqlFailure.getMessage(), sqlFailure);
        } catch (IOException ioFailure) {
            throw new MinerException("IO-Error: " + ioFailure.getMessage(), ioFailure);
        }
    }

    /**
     * Performs the actual mining work by delegating to {@link #parse()}.
     * Exceptions are passed on unchanged to the caller.
     */
    private void _run() throws IOException, MinerException, SQLException, ParseException, XmlReaderException {

        parse();
    }

    /**
     * Requests the miner to stop: raises the stop flag that the processing
     * loop checks, then signals the stop via emitStop.
     */
    @Override
    public void stop() {
        // Without raising the flag the stop check in the processing loop can never fire.
        stopped = true;
        emitStop();
    }

    /**
     * Returns the name of this miner; it is also used as the source label of
     * the warnings this class reports.
     */
    @Override
    public String getName() {
        return "SVN";
    }
}