nz.ac.waikato.its.irr.scripts.FixSquishedMetadata.java Source code

Java tutorial

Introduction

Here is the source code for nz.ac.waikato.its.irr.scripts.FixSquishedMetadata.java

Source

/*
 * This file is a part of the lconz-scripts project.
 * The contents of this file are subject to the license and copyright detailed in the LICENSE file at the root of the source tree.
 */

package nz.ac.waikato.its.irr.scripts;

import org.apache.commons.cli.*;
import org.apache.commons.lang.StringUtils;
import org.dspace.authorize.AuthorizeException;
import org.dspace.content.*;
import org.dspace.core.Constants;
import org.dspace.core.Context;
import org.dspace.handle.HandleManager;

import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

/**
 * Script to split multi-value keywords (separated by a given delimiter string) into individual values.
 *
 * @author Andrea Schweer schweer@waikato.ac.nz for the UoW Institutional Research Repositories
 */
public class FixSquishedMetadata {
    /** Command line options understood by this script; populated once in the static initialiser. */
    private static final Options OPTIONS = new Options();

    /**
     * Regular expression matching one or more consecutive carriage returns, line feeds, tabs or spaces.
     * Used to collapse any such run into a single space when cleaning up split values.
     */
    private static final String WHITESPACE_RUN_REGEX = "[\\r\\n\\t ]+";

    static {
        Option option = new Option("f", "field", true, "The metadata field to process. Required.");
        option.setRequired(true);
        OPTIONS.addOption(option);
        option = new Option("d", "delimiter", true, "Delimiter string for squished keywords. Required.");
        option.setRequired(true);
        OPTIONS.addOption(option);
        OPTIONS.addOption("m", "matches", true,
                "Number of times the delimiter must occur to count as a match. Optional. Default is 1.");
        OPTIONS.addOption("n", "dry-run", false,
                "If given, do not actually make any changes; instead, print out what would have been changed without this flag. Optional.");
        OPTIONS.addOption("i", "identifier", true,
                "Handle of DSpace object to process. If omitted, all items will be processed. Optional.");
        OPTIONS.addOption("h", "help", false, "Print help for this command and exit without taking any action.");
    }

    /**
     * Command line entry point.
     *
     * <p>Parses the options, optionally resolves the handle given with {@code -i} to a DSpace object, and
     * processes the items in scope: every item that has the field when no handle (or the site handle) is
     * given, otherwise all items of the community / collection, or the single item. The context is
     * completed only when at least one item was actually changed; in every other case (including dry runs
     * and errors) it is aborted in the {@code finally} block if it is still valid.</p>
     *
     * <p>NOTE(review): {@code ScriptUtils.printHelpAndExit} is defined elsewhere; the control flow below
     * relies on it terminating the JVM, as its name suggests.</p>
     *
     * @param args the raw command line arguments
     */
    public static void main(String[] args) {
        CommandLine line = null;
        try {
            line = new BasicParser().parse(OPTIONS, args);
        } catch (ParseException e) {
            System.err.println("Could not parse command line options: " + e.getMessage());
            ScriptUtils.printHelpAndExit(FixSquishedMetadata.class.getSimpleName(), 1, OPTIONS);
        }

        if (line == null || line.hasOption("h")) {
            ScriptUtils.printHelpAndExit(FixSquishedMetadata.class.getSimpleName(), 0, OPTIONS);
        }

        Context context = null;
        try {
            context = new Context();
            context.turnOffAuthorisationSystem();

            // Resolve the optional handle; dso stays null when all items are to be processed.
            DSpaceObject dso = null;
            if (line.hasOption("i")) {
                String handle = line.getOptionValue("i");
                dso = HandleManager.resolveToObject(context, handle);
                if (dso == null) {
                    System.err.println("Could not resolve identifier " + handle + " to a DSpace object");
                    ScriptUtils.printHelpAndExit(FixSquishedMetadata.class.getSimpleName(), 1, OPTIONS);
                }
            }

            String delimiter = line.getOptionValue("d");
            int minMatches = 1;
            if (line.hasOption("m")) {
                try {
                    // parseInt yields the primitive directly, avoiding a pointless box/unbox round trip
                    minMatches = Integer.parseInt(line.getOptionValue("m"));
                } catch (NumberFormatException e) {
                    System.err.println("Could not parse min matches value (" + line.getOptionValue("m")
                            + ") as a number :" + e.getMessage());
                    ScriptUtils.printHelpAndExit(FixSquishedMetadata.class.getSimpleName(), 1, OPTIONS);
                }
            }
            boolean dryRun = line.hasOption("n");

            // The field is given as schema.element[.qualifier]; anything with fewer than two parts is rejected.
            String schema, element, qualifier;
            String[] fieldComponents = line.getOptionValue("f", "").split("\\.");
            if (fieldComponents.length < 2) {
                System.err.println("Unsupported metadata field name: " + line.getOptionValue("f"));
                ScriptUtils.printHelpAndExit(FixSquishedMetadata.class.getSimpleName(), 1, OPTIONS);
            }
            schema = fieldComponents[0];
            element = fieldComponents[1];
            qualifier = fieldComponents.length > 2 ? fieldComponents[2] : null;

            boolean changes = false;
            if (dso == null || dso.getType() == Constants.SITE) {
                // Whole repository: only look at items that have a value in the field at all.
                changes = process(Item.findByMetadataField(context, schema, element, qualifier, Item.ANY), schema,
                        element, qualifier, delimiter, minMatches, dryRun);
            } else if (dso.getType() == Constants.COMMUNITY) {
                Collection[] collections = ((Community) dso).getAllCollections();
                for (Collection collection : collections) {
                    changes |= process(collection.getAllItems(), schema, element, qualifier, delimiter, minMatches,
                            dryRun);
                }
            } else if (dso.getType() == Constants.COLLECTION) {
                changes = process(((Collection) dso).getAllItems(), schema, element, qualifier, delimiter,
                        minMatches, dryRun);
            } else if (dso.getType() == Constants.ITEM) {
                changes = process((Item) dso, schema, element, qualifier, delimiter, minMatches, dryRun);
            } else {
                System.err.println("Unsupported type of DSpace object: " + dso.getTypeText()
                        + ", need site, community, collection or item handle");
                ScriptUtils.printHelpAndExit(FixSquishedMetadata.class.getSimpleName(), 1, OPTIONS);
            }
            if (changes) {
                // Only complete the context when something was modified; otherwise it is aborted below.
                context.complete();
            }
        } catch (SQLException | AuthorizeException | IOException e) {
            e.printStackTrace(System.err);
        } finally {
            if (context != null && context.isValid()) {
                context.abort();
            }
        }
    }

    /**
     * Processes every item produced by the given iterator and closes the iterator afterwards.
     *
     * @param items      the items to process; closed by this method, also when processing fails
     * @param schema     schema of the metadata field to process
     * @param element    element of the metadata field to process
     * @param qualifier  qualifier of the metadata field to process, may be {@code null}
     * @param delimiter  the delimiter string separating the squished values
     * @param minMatches minimum number of delimiter occurrences for a value to be split
     * @param dryRun     if {@code true}, only report what would be changed
     * @return {@code true} if at least one item was modified
     * @throws SQLException       passed on from the underlying DSpace calls
     * @throws AuthorizeException passed on from the underlying DSpace calls
     */
    private static boolean process(ItemIterator items, String schema, String element, String qualifier,
            String delimiter, int minMatches, boolean dryRun) throws SQLException, AuthorizeException {
        boolean changes = false;
        try {
            while (items.hasNext()) {
                changes |= process(items.next(), schema, element, qualifier, delimiter, minMatches, dryRun);
            }
        } finally {
            // Release the iterator even if processing an item throws.
            items.close();
        }
        return changes;
    }

    /**
     * Splits the squished values of one metadata field of a single item.
     *
     * <p>Every non-blank value that contains the delimiter at least {@code minMatches} times is split at
     * the delimiter; each part is whitespace-normalised and blank parts are dropped. All other values are
     * kept unchanged and in their original position. Each split is reported on standard output, in dry-run
     * mode as well. Unless this is a dry run, the field is then cleared and re-populated with the
     * resulting values.</p>
     *
     * @param item       the item to process
     * @param schema     schema of the metadata field to process
     * @param element    element of the metadata field to process
     * @param qualifier  qualifier of the metadata field to process, may be {@code null}
     * @param delimiter  the delimiter string separating the squished values
     * @param minMatches minimum number of delimiter occurrences for a value to be split
     * @param dryRun     if {@code true}, only report what would be changed
     * @return {@code true} if the item's metadata was modified; always {@code false} in a dry run
     * @throws SQLException       passed on from the underlying DSpace calls
     * @throws AuthorizeException passed on from the underlying DSpace calls
     */
    private static boolean process(Item item, String schema, String element, String qualifier, String delimiter,
            int minMatches, boolean dryRun) throws SQLException, AuthorizeException {
        boolean changes = false;
        List<Metadatum> newMetadata = new ArrayList<>();

        Metadatum[] allMd = item.getMetadata(schema, element, qualifier, Item.ANY);
        for (Metadatum md : allMd) {
            if (StringUtils.isNotBlank(md.value) && StringUtils.countMatches(md.value, delimiter) >= minMatches) {
                String[] individualValues = StringUtils.splitByWholeSeparator(md.value, delimiter);
                for (int i = 0; i < individualValues.length; i++) {
                    individualValues[i] = normaliseWhitespace(individualValues[i]);
                }
                System.out.println("item id=" + item.getID() + ": split |" + md.value + "| into |"
                        + StringUtils.join(individualValues, '|') + "|");
                if (!dryRun) {
                    for (String individualValue : individualValues) {
                        if (StringUtils.isNotBlank(individualValue)) {
                            // Only language and value are set here; authority and confidence of the
                            // squished original are not carried over to the individual values.
                            Metadatum newMd = new Metadatum();
                            newMd.language = md.language;
                            newMd.value = individualValue;
                            newMetadata.add(newMd);
                        }
                    }
                    changes = true;
                }
            } else {
                // Not a squished value: keep it as is, including its authority and confidence.
                newMetadata.add(md);
            }
        }
        if (!dryRun && changes) {
            // Replace all values of the field so that the original value order is retained.
            item.clearMetadata(schema, element, qualifier, Item.ANY);
            for (Metadatum newMd : newMetadata) {
                item.addMetadata(schema, element, qualifier, newMd.language, newMd.value, newMd.authority,
                        newMd.confidence);
            }
            item.updateMetadata();
        }
        return changes;
    }

    /**
     * Collapses every run of carriage returns, line feeds, tabs and spaces into a single space and trims
     * the result. Collapsing whole runs in one pass also handles runs of three or more such characters
     * (e.g. {@code "\r\n\t"}), which a single pairwise replacement of double spaces would leave partly
     * uncollapsed.
     *
     * @param value the raw value, must not be {@code null}
     * @return the value with normalised whitespace
     */
    private static String normaliseWhitespace(String value) {
        return value.replaceAll(WHITESPACE_RUN_REGEX, " ").trim();
    }
}