edu.mit.lib.tools.Modernize.java Source code

Java tutorial

Introduction

Here is the source code for edu.mit.lib.tools.Modernize.java

Source

/**
 * Copyright 2014 MIT Libraries
 * Licensed under: http://www.apache.org/licenses/LICENSE-2.0
 */
package edu.mit.lib.tools;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.FileSystems;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.PathMatcher;
import java.nio.file.StandardOpenOption;
import java.sql.SQLException;
import java.util.Map;
import java.util.HashMap;
import java.util.List;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Scanner;
import java.util.Stack;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.stream.XMLStreamWriter;

import static javax.xml.stream.XMLStreamConstants.*;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.lang.StringUtils;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.FileRequestEntity;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.methods.RequestEntity;

import org.dspace.authorize.AuthorizeException;
import org.dspace.content.Bitstream;
import org.dspace.content.Bundle;
import org.dspace.content.Collection;
import org.dspace.content.Community;
import org.dspace.content.DSpaceObject;
import org.dspace.content.Item;
import org.dspace.content.ItemIterator;
import org.dspace.core.Constants;
import org.dspace.core.Context;
import org.dspace.handle.HandleManager;
import org.dspace.app.itemexport.ItemExport;

import edu.mit.lib.bagit.Bag;
import edu.mit.lib.bagit.Filler;

import edu.mit.lib.bagit.BagUtils;
import static edu.mit.lib.bagit.BagUtils.*;

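/**
 * Migrates repository data from a DSpace repository to a remote MDS
 * repository. Content under a root handle (or the whole repository) is
 * exported as SIP packages into a scratch directory, and the packages are
 * then POSTed to the target. Fairly quick and dirty.
 * @author richardrodgers
 */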

public class Modernize {

    private Context context;
    private Path scratchDir;
    private ExportManifest manif;

    /**
     * Creates a migrator that works out of the given scratch directory.
     *
     * @param scratchDir directory the export manifest file is resolved against
     *                   and in which packages are built
     * @throws Exception if the DSpace context or the manifest cannot be created
     */
    public Modernize(Path scratchDir) throws Exception {
        // scratchDir must be assigned before the manifest is built: the
        // manifest resolves its "export.map" file against it
        this.scratchDir = scratchDir;
        this.context = new Context();
        this.manif = new ExportManifest();
    }

    /**
     * Command-line entry point.
     * <p>
     * Options: {@code -i} root handle to export ({@code all} for the entire
     * repository), {@code -t} URL of the MDS repository to import into,
     * {@code -s} scratch directory (mandatory), {@code -m} migrate flag
     * (accepted but not consulted in this method), {@code -h} help.
     *
     * @param args command-line arguments
     * @throws Exception on any parsing, export or upload failure
     */
    public static void main(String[] args) throws Exception {

        // create an options object and populate it
        CommandLineParser parser = new PosixParser();

        Options options = new Options();

        options.addOption("i", "identifier", true, "root handle to migrate - 'all' for entire repo");
        options.addOption("t", "target", true, "URL of mds repository to import into");
        options.addOption("s", "scratch", true, "scratch directory for processing");
        options.addOption("m", "migrate", false,
                "export for migration (remove handle and metadata that will be re-created in new system)");
        options.addOption("h", "help", false, "help");

        CommandLine line = parser.parse(options, args);

        if (line.hasOption('h')) {
            HelpFormatter myhelp = new HelpFormatter();
            myhelp.printHelp("Modernize\n", options);
            System.out.println(
                    "\nentire repository: Modernize -i all -t http://my-mds-repo.org/webapi -s /dspace/export");
            System.out.println(
                    "\ncontent subtree: Modernize -i 123456789/1 -t http://my-mds-repo.org/webapi -s /dspace/export");
            System.exit(0);
        }

        // The scratch directory is mandatory. Check it whether or not -s was
        // given: previously a missing -s slipped through and Paths.get(null)
        // failed with a NullPointerException.
        String scratch = line.getOptionValue('s');
        if (scratch == null) {
            System.out.println("Scratch directory required!");
            System.exit(1);
        }

        Modernize mod = new Modernize(Paths.get(scratch));
        try {
            if (line.hasOption('i')) {
                String id = line.getOptionValue('i');
                if (id != null) {
                    mod.exportIdentifier(id);
                } else {
                    mod.bail("Must provide an identifer!");
                }
            }

            if (line.hasOption('t')) {
                String targetUrl = line.getOptionValue('t');
                if (targetUrl != null) {
                    mod.importToMds(targetUrl);
                } else {
                    mod.bail("Must provide an URL to an mds repository!");
                }
            }
        } finally {
            // release the context even when export/import throws; bail()
            // exits the JVM itself, so this does not run a second time there
            mod.finish();
        }
    }

    /**
     * Builds the export manifest for the given identifier, creates a package
     * for every manifest entry, and finally writes the manifest to disk.
     *
     * @param id a community or collection handle, or the literal "all" for
     *           the entire repository
     * @throws IOException on manifest or package I/O failure
     * @throws SQLException on database failure
     * @throws AuthorizeException if content access is denied
     */
    private void exportIdentifier(String id) throws IOException, SQLException, AuthorizeException {
        if ("all".equals(id)) {
            repoManifest();
        } else {
            // validate the identifier
            DSpaceObject resolved = HandleManager.resolveToObject(context, id);
            if (resolved == null) {
                bail("Unresolvable identifier: " + id);
            }
            // construct the manifest
            switch (resolved.getType()) {
            case Constants.COMMUNITY:
                communityManifest((Community) resolved);
                break;
            case Constants.COLLECTION:
                collectionManifest((Collection) resolved);
                break;
            default:
                bail("Identifier: " + id + " is not a collection or community");
                break;
            }
        }
        manifestToScratch();
        // flush manifest to disk for possible future use
        manif.write();
    }

    /**
     * Prints the message to standard output, calls {@link #finish()}, and
     * terminates the JVM with exit status 1. Does not return normally.
     *
     * @param message text describing why processing stops
     * @throws SQLException if finishing the context fails
     */
    private void bail(String message) throws SQLException {
        System.out.println(message);
        this.finish();
        System.exit(1);
    }

    /**
     * Adds every top-level community (and, through it, its subtree) to the
     * manifest.
     *
     * @throws IOException on manifest failure
     * @throws SQLException on database failure
     */
    private void repoManifest() throws IOException, SQLException {
        Community[] topLevel = Community.findAllTop(context);
        for (int i = 0; i < topLevel.length; i++) {
            communityManifest(topLevel[i]);
        }
    }

    /**
     * Adds a community, preceded by its chain of ancestor communities, to
     * the manifest.
     *
     * @param comm the community that roots the exported subtree
     * @throws IOException on manifest failure
     * @throws SQLException on database failure
     */
    private void communityManifest(Community comm) throws IOException, SQLException {
        // walk upwards, nearest ancestor first, so that the top-most
        // ancestor ends up on top of the stack
        Stack<Community> ancestors = new Stack<>();
        for (Community up = comm.getParentCommunity(); up != null; up = up.getParentCommunity()) {
            ancestors.push(up);
        }
        int depth = manif.addParents(ancestors);
        manif.addCommunity(comm, depth);
    }

    /**
     * Adds a collection, preceded by its chain of ancestor communities, to
     * the manifest.
     *
     * @param coll the collection that roots the exported subtree
     * @throws IOException on manifest failure
     * @throws SQLException on database failure
     */
    private void collectionManifest(Collection coll) throws IOException, SQLException {
        // walk upwards starting at the collection's parent object, nearest
        // ancestor first, so the top-most ancestor ends up on top of the stack
        Stack<Community> ancestors = new Stack<>();
        for (Community up = (Community) coll.getParentObject(); up != null; up = up.getParentCommunity()) {
            ancestors.push(up);
        }
        int depth = manif.addParents(ancestors);
        manif.addCollection(coll, depth);
    }

    /**
     * Creates one SIP package in the scratch directory for each manifest
     * entry. If the in-memory manifest is empty it is first read from disk.
     *
     * @throws IOException on manifest or package I/O failure
     * @throws SQLException on database failure
     * @throws AuthorizeException if content access is denied
     */
    public void manifestToScratch() throws IOException, SQLException, AuthorizeException {
        // Just create a SIP package for each line in manifest and put in scratch directory
        if (manif.isEmpty()) {
            manif.read();
        }
        for (String handle : manif.entries) {
            DSpaceObject dso = HandleManager.resolveToObject(context, handle);
            if (dso == null) {
                bail("Unresolvable identifier: " + handle);
            }
            // Each case must end in break: without it a community fell
            // through into the collection and item branches and failed with
            // a ClassCastException.
            switch (dso.getType()) {
            case Constants.COMMUNITY:
                makeCommPackage((Community) dso);
                break;
            case Constants.COLLECTION:
                makeCollPackage((Collection) dso);
                break;
            case Constants.ITEM:
                makeItemPackage((Item) dso);
                break;
            default:
                // other object types are not packaged
                break;
            }
        }
    }

    /**
     * Uploads the package of every manifest entry, in manifest order, to the
     * target repository. If the in-memory manifest is empty it is first read
     * from disk. The level recorded for each entry is used to track which
     * handle is the parent of the entry being posted.
     *
     * @param targetUrl base URL of the repository to import into
     * @throws IOException on manifest read or upload failure
     */
    public void importToMds(String targetUrl) throws IOException {
        if (manif.isEmpty()) {
            manif.read();
        }
        Stack<String> ownerChain = new Stack<>();
        ownerChain.push(null); // indicates no parent object

        final int lastIdx = manif.entries.size() - 1;
        for (int idx = 0; idx <= lastIdx; idx++) {
            String handle = manif.entries.get(idx);
            Path pkg = getPackage(handle);
            String postUrl = getPostUrl(targetUrl, ownerChain.peek(), manif.ctypes.get(idx));
            uploadPackage(pkg, postUrl);
            if (idx == lastIdx) {
                // nothing follows the final entry, so the chain needs no update
                break;
            }
            int climb = manif.levels.get(idx) - manif.levels.get(idx + 1);
            if (climb < 0) {
                // next entry is deeper: this entry becomes the current parent
                ownerChain.push(handle);
            } else {
                // next entry is shallower by 'climb' levels: drop that many
                // parents; climb == 0 means a sibling follows, nothing to do
                for (; climb > 0; climb--) {
                    ownerChain.pop();
                }
            }
        }
    }

    /**
     * Builds the URL a package is POSTed to.
     *
     * @param targetUrl base URL of the target repository; a trailing slash is
     *                  added when missing
     * @param handle    handle of the parent object, or null when there is none
     * @param ctype     DSpace content type constant, used as an index into
     *                  {@code Constants.typeText}
     * @return the base URL followed by the package path, or by the parent
     *         handle and then the package path when a parent is given
     */
    private String getPostUrl(String targetUrl, String handle, int ctype) {
        // NB: these URLs are a bit fragile - really should be queried from REST API
        String baseUrl = targetUrl.endsWith("/") ? targetUrl : targetUrl + "/";
        String pkgName = "package/" + Constants.typeText[ctype] + "-sip";
        if (handle == null) {
            return baseUrl + pkgName;
        }
        // A separator is needed between the handle and the package path;
        // without it the result ran together as ".../123456789/1package/...".
        // NOTE(review): the exact URL layout expected by the target is not
        // visible here - confirm against the MDS REST API.
        return baseUrl + handle + "/" + pkgName;
    }

    /**
     * Returns the path of the zip package for a handle inside the scratch
     * directory: the handle with every '/' replaced by '-', plus ".zip".
     *
     * @param handle handle of the packaged object
     * @return path of the package file
     */
    private Path getPackage(String handle) {
        String fileName = handle.replace('/', '-') + ".zip";
        return scratchDir.resolve(fileName);
    }

    /** Community metadata fields copied into the package's metadata.xml. */
    private static final String[] commFields = {
        "name", "short_description", "introductory_text", "copyright_text", "side_bar_text"
    };

    /**
     * Builds a SIP package for a community in the scratch directory. The
     * package records the object type, the handle, the parent community's
     * handle (when there is one), the non-null fields listed in
     * {@code commFields}, and the logo bitstream if one is set.
     *
     * @param comm the community to package
     * @return the value returned by {@code Filler.toPackage()}
     * @throws IOException on package I/O failure
     * @throws SQLException on database failure
     * @throws AuthorizeException if the logo cannot be retrieved
     */
    private Path makeCommPackage(Community comm) throws IOException, SQLException, AuthorizeException {
        String bagName = comm.getHandle().replace('/', '-');
        Filler filler = new Filler(scratchDir.resolve(bagName));
        filler.metadata(BAG_TYPE, "SIP");
        filler.property("data/object", OBJECT_TYPE, "community");
        filler.property("data/object", OBJECT_ID, comm.getHandle());
        Community owner = comm.getParentCommunity();
        if (owner != null) {
            filler.property("data/object", OWNER_ID, owner.getHandle());
        }
        // metadata
        XmlWriter xml = xmlWriter(filler.payloadStream("metadata.xml"));
        xml.startStanza("metadata");
        for (int i = 0; i < commFields.length; i++) {
            String value = comm.getMetadata(commFields[i]);
            if (value == null) {
                continue; // unset fields are left out of the document
            }
            xml.writeValue(commFields[i], value);
        }
        xml.endStanza();
        xml.close();
        // check for logo
        Bitstream logo = comm.getLogo();
        if (logo != null) {
            filler.payload("logo", logo.retrieve());
        }
        return filler.toPackage();
    }

    /** Collection metadata fields copied into the package's metadata.xml. */
    private static final String[] collFields = {
        "name", "short_description", "introductory_text", "provenance_description",
        "license", "copyright_text", "side_bar_text"
    };

    /**
     * Builds a SIP package for a collection in the scratch directory. The
     * package records the object type, the handle, the parent object's
     * handle (when there is one), the non-null fields listed in
     * {@code collFields}, and the logo bitstream if one is set.
     *
     * @param coll the collection to package
     * @return the value returned by {@code Filler.toPackage()}
     * @throws IOException on package I/O failure
     * @throws SQLException on database failure
     * @throws AuthorizeException if the logo cannot be retrieved
     */
    private Path makeCollPackage(Collection coll) throws IOException, SQLException, AuthorizeException {
        String bagName = coll.getHandle().replace('/', '-');
        Filler filler = new Filler(scratchDir.resolve(bagName));
        filler.metadata(BAG_TYPE, "SIP");
        filler.property("data/object", OBJECT_TYPE, "collection");
        filler.property("data/object", OBJECT_ID, coll.getHandle());
        DSpaceObject owner = coll.getParentObject();
        if (owner != null) {
            filler.property("data/object", OWNER_ID, owner.getHandle());
        }
        // metadata
        XmlWriter xml = xmlWriter(filler.payloadStream("metadata.xml"));
        xml.startStanza("metadata");
        for (int i = 0; i < collFields.length; i++) {
            String value = coll.getMetadata(collFields[i]);
            if (value == null) {
                continue; // unset fields are left out of the document
            }
            xml.writeValue(collFields[i], value);
        }
        xml.endStanza();
        xml.close();
        // check for logo
        Bitstream logo = coll.getLogo();
        if (logo != null) {
            filler.payload("logo", logo.retrieve());
        }
        return filler.toPackage();
    }

    /**
     * Builds a SIP package for an item in the scratch directory. The package
     * records the object type, the handle, the owning collection's handle,
     * a comma-separated list of the other collections the item appears in,
     * a withdrawn flag, the item metadata, and, for every bundle except the
     * one named "TEXT", each bitstream's content plus a small metadata
     * document.
     *
     * @param item the item to package
     * @return the value returned by {@code Filler.toPackage()}
     * @throws IOException on package I/O failure
     * @throws SQLException on database failure
     * @throws AuthorizeException if a bitstream cannot be retrieved
     */
    private Path makeItemPackage(Item item) throws IOException, SQLException, AuthorizeException {
        Filler filler = new Filler(scratchDir.resolve(item.getHandle().replaceAll("/", "-")));
        filler.metadata(BAG_TYPE, "SIP");
        filler.property("data/object", OBJECT_TYPE, "item");
        filler.property("data/object", OBJECT_ID, item.getHandle());
        // get collections
        StringBuilder linked = new StringBuilder();
        for (Collection coll : item.getCollections()) {
            if (item.isOwningCollection(coll)) {
                filler.property("data/object", OWNER_ID, coll.getHandle());
            } else {
                // Put the separator before every handle except the first, so
                // there is no trailing comma to trim. The previous trimming
                // removed two characters and so cut the last handle short.
                if (linked.length() > 0) {
                    linked.append(',');
                }
                linked.append(coll.getHandle());
            }
        }
        if (linked.length() > 0) {
            filler.property("data/object", OTHER_IDS, linked.toString());
        }
        if (item.isWithdrawn()) {
            filler.property("data/object", WITHDRAWN, "true");
        }
        // metadata
        BagUtils.writeMetadata(item, filler.payloadStream("metadata.xml"));
        // proceed to bundles, in sub-directories, excluding bundles with derivatives
        for (Bundle bundle : item.getBundles()) {
            if (!"TEXT".equals(bundle.getName())) {
                // only bundle metadata is the primary bitstream - remember it
                // and place in bitstream metadata if defined
                int primaryId = bundle.getPrimaryBitstreamID();
                for (Bitstream bs : bundle.getBitstreams()) {
                    // write metadata to xml file
                    String seqId = String.valueOf(bs.getSequenceID());
                    String relPath = bundle.getName() + "/";
                    OutputStream metaOut = filler.payloadStream(relPath + seqId + "-metadata.xml");
                    XmlWriter writer = xmlWriter(metaOut);
                    writer.startStanza("metadata");
                    // field access is hard-coded in Bitstream class, ugh!
                    // NOTE(review): these values are written without a null
                    // check, unlike the community/collection field loops -
                    // confirm that writeValue tolerates null.
                    writer.writeValue("name", bs.getName());
                    writer.writeValue("source", bs.getSource());
                    writer.writeValue("description", bs.getDescription());
                    writer.writeValue("sequence_id", seqId);
                    if (bs.getID() == primaryId) {
                        writer.writeValue("bundle_primary", "true");
                    }
                    writer.endStanza();
                    writer.close();
                    // add bytes to bag
                    filler.payload(relPath + seqId, bs.retrieve());
                }
            }
        }
        return filler.toPackage();
    }

    /**
     * POSTs a package file to the given URI with content type
     * "application/zip".
     *
     * @param pkg       path of the package file to send
     * @param targetUri URI to POST the package to
     * @throws IOException if the request cannot be executed, or if the
     *                     response carries an HTTP status of 400 or above
     */
    private void uploadPackage(Path pkg, String targetUri) throws IOException {
        // using older Apache http client library to make compatible with more systems
        PostMethod post = new PostMethod(targetUri);
        HttpClient client = new HttpClient();
        RequestEntity entity = new FileRequestEntity(pkg.toFile(), "application/zip");
        post.setRequestEntity(entity);
        try {
            int status = client.executeMethod(post);
            // The status used to be ignored, so a rejected upload went
            // unnoticed. Only client/server error codes are treated as
            // failure; other responses are accepted as before.
            if (status >= 400) {
                throw new IOException("Upload of " + pkg + " to " + targetUri
                        + " failed: " + post.getStatusLine());
            }
        } finally {
            post.releaseConnection();
        }
    }

    /**
     * Ends processing by calling {@code abort()} on the DSpace context.
     *
     * @throws SQLException declared for callers; raised if the abort fails
     */
    private void finish() throws SQLException {
        this.context.abort();
    }

    // map of content subtree - serialized as a plain-text file with one
    // "level ctype handle" line per entry
    private class ExportManifest {

        // The three lists are parallel: index i holds the handle, DSpace
        // content type and tree depth of the i-th entry.
        List<String> entries = new ArrayList<>();
        List<Integer> ctypes = new ArrayList<>();
        List<Integer> levels = new ArrayList<>();
        // file the manifest is written to and read from
        Path exportMap;

        /**
         * Creates an empty manifest backed by "export.map" in the scratch
         * directory of the enclosing instance.
         *
         * @throws IOException declared for callers; not raised by this code
         */
        public ExportManifest() throws IOException {
            exportMap = scratchDir.resolve("export.map");
        }

        /** @return true when no entries have been added or read */
        public boolean isEmpty() {
            return entries.isEmpty();
        }

        /**
         * Adds the ancestor communities on the stack, top of stack first,
         * at increasing levels starting from 0. The stack is emptied.
         *
         * @param parents ancestors with the outermost one on top
         * @return the level following the last ancestor added (0 when the
         *         stack was empty)
         * @throws IOException declared for callers
         */
        public int addParents(Stack<Community> parents) throws IOException {
            int level = 0;
            // Test for emptiness rather than waiting for pop() to return
            // null: Stack.pop() throws EmptyStackException on an empty stack.
            while (!parents.empty()) {
                addHandle(parents.pop().getHandle(), level++, Constants.COMMUNITY);
            }
            return level;
        }

        /**
         * Adds a community, then recursively its sub-communities and its
         * collections one level deeper.
         *
         * @param comm  community to add
         * @param level depth recorded for the community
         * @throws IOException declared for callers
         * @throws SQLException on database failure
         */
        public void addCommunity(Community comm, int level) throws IOException, SQLException {
            addHandle(comm.getHandle(), level, Constants.COMMUNITY);
            for (Community subComm : comm.getSubcommunities()) {
                addCommunity(subComm, level + 1);
            }
            for (Collection coll : comm.getCollections()) {
                addCollection(coll, level + 1);
            }
        }

        /**
         * Adds a collection, then each of its items one level deeper.
         *
         * @param coll  collection to add
         * @param level depth recorded for the collection
         * @throws IOException declared for callers
         * @throws SQLException on database failure
         */
        public void addCollection(Collection coll, int level) throws IOException, SQLException {
            addHandle(coll.getHandle(), level, Constants.COLLECTION);
            ItemIterator iiter = coll.getItems();
            try {
                while (iiter.hasNext()) {
                    addHandle(iiter.next().getHandle(), level + 1, Constants.ITEM);
                }
            } finally {
                // close the iterator even when iteration fails
                iiter.close();
            }
        }

        /**
         * Writes the manifest to the export map file as UTF-8 text, one
         * "level ctype handle" line per entry.
         *
         * @throws IOException if the file cannot be written
         */
        public void write() throws IOException {
            try (BufferedWriter writer = Files.newBufferedWriter(exportMap, StandardCharsets.UTF_8)) {
                for (int i = 0; i < entries.size(); i++) {
                    // Build the line as text. Passing the Integer directly
                    // selects Writer.write(int), which writes one character
                    // with that code instead of the decimal number.
                    writer.write(levels.get(i) + " " + ctypes.get(i) + " " + entries.get(i));
                    writer.newLine();
                }
            }
        }

        /**
         * Reads entries from the export map file and appends them to the
         * in-memory lists. Reading stops at the first line that does not
         * start with an integer, so blank trailing lines and a missing
         * final newline are both tolerated.
         *
         * @throws IOException if the file cannot be opened
         */
        public void read() throws IOException {
            try (Scanner scanner = new Scanner(exportMap, StandardCharsets.UTF_8.name())) {
                while (scanner.hasNextInt()) {
                    levels.add(scanner.nextInt());
                    ctypes.add(scanner.nextInt());
                    entries.add(scanner.next());
                    // skip whatever remains of the line, if the line is terminated
                    if (scanner.hasNextLine()) {
                        scanner.nextLine();
                    }
                }
            }
        }

        /** Appends one entry to the three parallel lists. */
        private void addHandle(String handle, int level, int ctype) throws IOException {
            levels.add(level);
            ctypes.add(ctype);
            entries.add(handle);
        }
    }
}