trigeditor.editors.PrefixRefactor.java Source code

Introduction

Here is the source code for trigeditor.editors.PrefixRefactor.java
Source

/***********************************************************************
 * Copyright (c) 2014 Cambridge Semantics Incorporated.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     Cambridge Semantics Incorporated - initial API and implementation
 ***********************************************************************/

package trigeditor.editors;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.commons.io.FileUtils;

/**
 * The PrefixRefactor object handles prefix refactoring for TriG files by the TriGEditor.
 *
 * Prefix refactoring scans the text for IRI references ({@code <...>}) and for existing
 * {@code @prefix} declarations. Every namespace that occurs at least
 * {@value #MIN_OCCURRENCES} times and is not declared yet receives a generated prefix
 * name; its declaration is added to the file and the matching IRI references are
 * rewritten as prefixed names.
 *
 * The namespace of an IRI is the text up to and including its first '#'; an IRI
 * without '#' is its own namespace and is rewritten as {@code name:} (empty local part).
 *
 * Limitation: the scan is purely textual, so something that looks like an IRI reference
 * inside a string literal or a comment is treated like any other IRI reference.
 *
 * @author Natasha
 *
 */
public class PrefixRefactor {

    /** Keyword that introduces a prefix declaration. */
    private static final String PREFIX_KEYWORD = "@prefix";

    /** Number of occurrences a namespace needs before a prefix is generated for it. */
    private static final int MIN_OCCURRENCES = 5;

    /** Maximum length of a generated prefix name, before any uniqueness suffix. */
    private static final int MAX_PREFIX_LENGTH = 8;

    /** Suffix appended (repeatedly if necessary) until a generated name is unused. */
    private static final String UNIQUE_SUFFIX = "two";

    /** Charset name used to read and write the file. */
    private static final String FILE_ENCODING = "UTF-8";

    /**
     * Empty constructor of a PrefixRefactor object
     */
    public PrefixRefactor() {

    }

    /**
     * Helper function to refactor prefixes.
     *
     * Derives a lowercase, letters-only prefix name of at most {@value #MAX_PREFIX_LENGTH}
     * characters from a namespace. The namespace is scanned backwards for the last segment
     * (delimited by '/', '#' or ':') that ends in a letter or underscore and contains a
     * letter before that; the ASCII letters of that segment form the name. A name that is
     * too long is abbreviated to its first letter plus its capital letters when there are
     * any, and truncated otherwise.
     *
     * @param namespace String representing the URI whose prefix name will be created
     * @return the prefix name, or {@code null} when no segment yields a name
     */
    private static String prefixFromNamespace(String namespace) {
        int start = -1;
        int end = -1;

        // i >= 0 so that the very first character can be part of the name as well
        for (int i = namespace.length() - 1; i >= 0; i--) {
            char c = namespace.charAt(i);
            if (c == '/' || c == '#' || c == ':') {
                if (end != -1 && start != -1) {
                    break;
                }
                // segment was unusable: forget it and try the one to its left
                end = -1;
                start = -1;
                continue;
            }
            if (end == -1) {
                if (isAsciiLetter(c) || c == '_') {
                    end = i;
                }
            } else if (start != -1 || isAsciiLetter(c)) {
                // once a letter was seen, the segment extends left up to the delimiter
                start = i;
            }
        }

        if (start == -1 || end == -1) {
            return null;
        }

        //guarantees that the prefix name will only contain letters
        StringBuilder letters = new StringBuilder();
        for (int i = start; i <= end; i++) {
            char c = namespace.charAt(i);
            if (isAsciiLetter(c)) {
                letters.append(c);
            }
        }
        String prefix = letters.toString();

        //guarantees the prefix name isn't longer than 8 characters
        if (prefix.length() > MAX_PREFIX_LENGTH) {
            StringBuilder initials = new StringBuilder();
            initials.append(prefix.charAt(0));
            for (int j = 1; j < prefix.length(); j++) {
                if (Character.isUpperCase(prefix.charAt(j))) {
                    initials.append(prefix.charAt(j));
                }
            }
            if (initials.length() > 1) {
                prefix = initials.toString();
            }
            if (prefix.length() > MAX_PREFIX_LENGTH) {
                prefix = prefix.substring(0, MAX_PREFIX_LENGTH);
            }
        }

        // fixed locale: the default locale could map 'I' to a non-ASCII character
        return prefix.toLowerCase(Locale.ENGLISH);
    }

    /**
     * Helper function to refactor prefixes.
     *
     * Counts how often each namespace occurs in the IRI references of a string and keeps
     * the namespaces that occur at least {@value #MIN_OCCURRENCES} times and are not the
     * target of an existing {@code @prefix} declaration.
     *
     * @param s String representing the contents being parsed to find URIs. In most cases, the
     * string representation of a TriG file.
     * @return map from each frequent, undeclared namespace to its number of occurrences, in
     * order of first appearance
     */
    private static Map<String, Integer> findURI(String s) {
        Map<String, Integer> counts = new LinkedHashMap<String, Integer>();
        int from = 0;
        int open;
        while ((open = s.indexOf('<', from)) >= 0) {
            int close = findIriEnd(s, open);
            if (close < 0) {
                from = open + 1;
                continue;
            }
            String namespace = namespaceOf(s.substring(open + 1, close));
            Integer seen = counts.get(namespace);
            counts.put(namespace, seen == null ? 1 : seen + 1);
            from = close + 1;
        }

        // parsed once, outside the loop; the result is filtered into a new map so that
        // nothing is removed from a map while it is being iterated
        Map<String, List<String>> declared = prefixMatchURI(s);
        Map<String, Integer> frequent = new LinkedHashMap<String, Integer>();
        for (Map.Entry<String, Integer> entry : counts.entrySet()) {
            if (entry.getValue() >= MIN_OCCURRENCES && !declared.containsKey(entry.getKey())) {
                frequent.put(entry.getKey(), entry.getValue());
            }
        }
        return frequent;
    }

    /**
     * Helper function to refactor prefixes.
     *
     * Collects the existing {@code @prefix} declarations of a string. Whitespace around
     * the prefix name and around the ':' is tolerated. The URI is stored exactly as
     * declared (including a trailing '#'), which is the same form {@link #findURI} uses
     * for its keys. A URI can be matched to multiple prefix names (although not recommended).
     *
     * @param s String being parsed for prefixes and matching URIs. This String is generally the
     * string representation of a file.
     * @return a map that maps every declared URI to the list of prefix names declared for it
     */
    private static Map<String, List<String>> prefixMatchURI(String s) {
        Map<String, List<String>> declared = new LinkedHashMap<String, List<String>>();
        int keyword = s.indexOf(PREFIX_KEYWORD);
        while (keyword >= 0) {
            int nameStart = keyword + PREFIX_KEYWORD.length();
            int resume = nameStart;
            int colon = s.indexOf(':', nameStart);
            if (colon < 0) {
                break;
            }
            String label = s.substring(nameStart, colon).trim();
            int open = skipWhitespace(s, colon + 1);
            if (isPrefixLabel(label) && open < s.length() && s.charAt(open) == '<') {
                int close = findIriEnd(s, open);
                if (close >= 0) {
                    String uri = s.substring(open + 1, close);
                    List<String> names = declared.get(uri);
                    if (names == null) {
                        names = new ArrayList<String>();
                        declared.put(uri, names);
                    }
                    names.add(0, label);
                    resume = close + 1;
                }
            }
            // a malformed declaration is skipped; scanning resumes right after its keyword
            keyword = s.indexOf(PREFIX_KEYWORD, resume);
        }
        return declared;
    }

    /**
     * Generates prefix names for the namespaces that appear in a string at least
     * {@value #MIN_OCCURRENCES} times and do not already have a prefix name. Generated
     * names are distinct from each other and from all names already declared in the string.
     * A namespace for which no name can be derived is left out.
     *
     * @param s String from which the prefixes are generated. This String is generally the
     * String representation of a file.
     * @return map from namespace to the prefix name generated for it, in order of first
     * appearance of the namespace
     */
    private static Map<String, String> generatePrefixes(String s) {
        Set<String> usedNames = new HashSet<String>();
        for (List<String> names : prefixMatchURI(s).values()) {
            usedNames.addAll(names);
        }

        Map<String, String> generated = new LinkedHashMap<String, String>();
        for (String namespace : findURI(s).keySet()) {
            String name = prefixFromNamespace(namespace);
            if (name == null) {
                continue;
            }
            //guarantees unique prefix names are used for different URIs
            while (usedNames.contains(name)) {
                name = name + UNIQUE_SUFFIX;
            }
            usedNames.add(name);
            generated.put(namespace, name);
        }
        return generated;
    }

    /**
     * Function that refactors prefixes. For namespaces that haven't been matched to a prefix
     * yet and appear in {@code input} at least {@value #MIN_OCCURRENCES} times, this function
     * adds a declaration of the form {@code @prefix name: <namespace> .} in front of the first
     * line of the file that contains {@code @prefix} (or at the top of the file when there is
     * none). In every line that does not contain {@code @prefix}, each IRI reference in such a
     * namespace is replaced by {@code name:local}, where {@code local} is the text after the
     * namespace. An IRI whose remaining text is not a plain name (ASCII letters, digits, '_'
     * and '-', not starting with a digit or '-') is left untouched.
     *
     * The file is read and written as UTF-8 and is always rewritten, even when nothing changes.
     *
     * @param input String representation of the file whose prefixes are being refactored;
     * {@code null} is treated as empty text.
     * @param filePath String representation of the path of the file
     * @throws IOException if the file cannot be read or written
     */
    public void refactorPrefixes(String input, String filePath) throws IOException {
        File f = new File(filePath);
        List<String> lines = readLines(f);
        Map<String, String> newPrefixes = generatePrefixes(input == null ? "" : input);

        List<String> declarations = new ArrayList<String>();
        for (Map.Entry<String, String> entry : newPrefixes.entrySet()) {
            declarations.add(PREFIX_KEYWORD + " " + entry.getValue() + ": <" + entry.getKey() + "> .");
        }

        //Replace URIs with prefix name in document
        int insertAt = -1;
        List<String> rewritten = new ArrayList<String>(lines.size() + declarations.size());
        for (String line : lines) {
            if (line.contains(PREFIX_KEYWORD)) {
                if (insertAt < 0) {
                    insertAt = rewritten.size();
                }
                rewritten.add(line);
            } else {
                rewritten.add(replaceIris(line, newPrefixes));
            }
        }

        //Add prefix name to list of prefixes (top of the file when there is no such list)
        rewritten.addAll(insertAt < 0 ? 0 : insertAt, declarations);

        writeLines(f, rewritten);
    }

    /** Rewrites every IRI reference of the line that lies in one of the given namespaces. */
    private static String replaceIris(String line, Map<String, String> prefixes) {
        if (prefixes.isEmpty()) {
            return line;
        }
        StringBuilder out = new StringBuilder(line.length());
        boolean changed = false;
        int copied = 0;
        int from = 0;
        int open;
        while ((open = line.indexOf('<', from)) >= 0) {
            int close = findIriEnd(line, open);
            if (close < 0) {
                from = open + 1;
                continue;
            }
            String iri = line.substring(open + 1, close);
            String namespace = namespaceOf(iri);
            String name = prefixes.get(namespace);
            String local = iri.substring(namespace.length());
            if (name != null && isSimpleLocalName(local)) {
                out.append(line, copied, open);
                // the angle brackets separated tokens; keep them apart with a space instead
                if (out.length() > 0 && isNameChar(out.charAt(out.length() - 1))) {
                    out.append(' ');
                }
                out.append(name).append(':').append(local);
                if (close + 1 < line.length() && isNameChar(line.charAt(close + 1))) {
                    out.append(' ');
                }
                copied = close + 1;
                changed = true;
            }
            from = close + 1;
        }
        if (!changed) {
            return line;
        }
        return out.append(line, copied, line.length()).toString();
    }

    /**
     * Returns the index of the '>' that closes the IRI reference opened by the '<' at
     * {@code open}, or -1 when whitespace, a control character, '"' or another '<' comes first.
     */
    private static int findIriEnd(String text, int open) {
        for (int i = open + 1; i < text.length(); i++) {
            char c = text.charAt(i);
            if (c == '>') {
                return i;
            }
            if (c <= ' ' || c == '<' || c == '"') {
                return -1;
            }
        }
        return -1;
    }

    /** Returns the IRI up to and including its first '#', or the whole IRI when it has none. */
    private static String namespaceOf(String iri) {
        int hash = iri.indexOf('#');
        return hash >= 0 ? iri.substring(0, hash + 1) : iri;
    }

    /** Returns the index of the first non-whitespace character at or after {@code from}. */
    private static int skipWhitespace(String text, int from) {
        int i = from;
        while (i < text.length() && Character.isWhitespace(text.charAt(i))) {
            i++;
        }
        return i;
    }

    /** Tells whether the text between "@prefix" and ':' can be a (possibly empty) prefix name. */
    private static boolean isPrefixLabel(String label) {
        for (int i = 0; i < label.length(); i++) {
            char c = label.charAt(i);
            if (Character.isWhitespace(c) || c == '<' || c == '>' || c == '@' || c == '"') {
                return false;
            }
        }
        return true;
    }

    /** Tells whether the text can be written unescaped after the ':' of a prefixed name. */
    private static boolean isSimpleLocalName(String local) {
        for (int i = 0; i < local.length(); i++) {
            char c = local.charAt(i);
            boolean ok = isAsciiLetter(c) || c == '_'
                    || (i > 0 && ((c >= '0' && c <= '9') || c == '-'));
            if (!ok) {
                return false;
            }
        }
        return true;
    }

    /** Tells whether the character could be read as part of an adjacent prefixed name. */
    private static boolean isNameChar(char c) {
        return isAsciiLetter(c) || (c >= '0' && c <= '9') || c == '_' || c == '-'
                || c == ':' || c == '.' || c > 127;
    }

    /** Tells whether the character is one of a-z or A-Z. */
    private static boolean isAsciiLetter(char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }

    /** Reads all lines of the file as UTF-8. */
    private static List<String> readLines(File file) throws IOException {
        List<String> lines = new ArrayList<String>();
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), FILE_ENCODING));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
        } finally {
            reader.close();
        }
        return lines;
    }

    /** Replaces the file content with the lines, as UTF-8, each followed by a line separator. */
    private static void writeLines(File file, List<String> lines) throws IOException {
        BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(file), FILE_ENCODING));
        try {
            for (String line : lines) {
                writer.write(line);
                writer.newLine();
            }
        } finally {
            writer.close();
        }
    }

}