org.researchgraph.crosswalk.CrosswalkRG.java Source code

Java tutorial

Introduction

Here is the source code for org.researchgraph.crosswalk.CrosswalkRG.java

Source

package org.researchgraph.crosswalk;

import java.io.InputStream;
import java.io.PrintStream;
import java.math.BigDecimal;
import java.text.SimpleDateFormat;
import java.util.TimeZone;

import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBElement;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Unmarshaller;
import javax.xml.datatype.XMLGregorianCalendar;

import org.apache.commons.lang.StringUtils;

import org.openarchives.oai._2.HeaderType;
import org.openarchives.oai._2.ListRecordsType;
import org.openarchives.oai._2.OAIPMHtype;
import org.openarchives.oai._2.RecordType;
import org.openarchives.oai._2.StatusType;
import org.researchgraph.graph.Graph;
import org.researchgraph.graph.GraphIndex;
import org.researchgraph.graph.GraphKey;
import org.researchgraph.graph.GraphNode;
import org.researchgraph.graph.GraphRelationship;
import org.researchgraph.graph.GraphSchema;
import org.researchgraph.graph.GraphUtils;
import org.researchgraph.graph.interfaces.GraphCrosswalk;
import org.researchgraph.schema.v2_0.xml.nodes.Dataset;
import org.researchgraph.schema.v2_0.xml.nodes.Grant;
import org.researchgraph.schema.v2_0.xml.nodes.Publication;
import org.researchgraph.schema.v2_0.xml.nodes.RegistryObjects;
import org.researchgraph.schema.v2_0.xml.nodes.Relation;
import org.researchgraph.schema.v2_0.xml.nodes.Researcher;

/**
 * Crosswalk that converts Research Graph XML (either wrapped in an OAI-PMH
 * {@code ListRecords} response or as a bare {@code RegistryObjects} document)
 * into a {@link Graph} of nodes and relationships.
 *
 * <p>The instance keeps running counters (files, records, deleted records,
 * broken records) across calls to {@link #process(InputStream)}; use
 * {@link #resetCounters()} to clear them and {@link #printStatistics(PrintStream)}
 * to report them.
 *
 * <p>Instances hold a single JAXB {@link Unmarshaller} and mutable counters
 * without any synchronization, so one instance should not be shared between
 * threads.
 */
public class CrosswalkRG implements GraphCrosswalk {

    /**
     * Formatter for the "last updated" property (UTC, ISO-8601 style).
     * {@link SimpleDateFormat} is not thread-safe and this instance is shared
     * by every crosswalk, so it must only be used through
     * {@link #formatTimestamp(XMLGregorianCalendar)}, which synchronizes on it.
     */
    private static final SimpleDateFormat formatter;

    static {
        formatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssX");
        formatter.setTimeZone(TimeZone.getTimeZone("UTC"));
    }

    /** Layout of the incoming XML document. */
    public enum XmlType {
        /** OAI-PMH envelope whose record metadata holds {@code RegistryObjects}. */
        oai,
        /** Bare {@code RegistryObjects} document. */
        rg
    }

    private Unmarshaller unmarshaller;
    private long existingRecords = 0;
    private long deletedRecords = 0;
    private long brokenRecords = 0;
    private long filesCounter = 0;
    private long markTime = 0;

    private XmlType type = XmlType.oai;

    private boolean verbose = false;

    private String source = null;
    //   private boolean needAndsGroup = true;

    /**
     * Creates a crosswalk with an unmarshaller bound to the OAI-PMH and
     * Research Graph v2.0 node packages.
     *
     * @throws JAXBException if the JAXB context or unmarshaller cannot be created
     */
    public CrosswalkRG() throws JAXBException {
        unmarshaller = JAXBContext.newInstance("org.openarchives.oai._2:org.researchgraph.schema.v2_0.xml.nodes")
                .createUnmarshaller();
    }

    /** @return number of node records seen so far (including deleted and broken ones) */
    public long getExistingRecords() {
        return existingRecords;
    }

    /** @return number of records that were flagged as deleted */
    public long getDeletedRecords() {
        return deletedRecords;
    }

    /** @return number of records that were skipped because they had no key */
    public long getBrokenRecords() {
        return brokenRecords;
    }

    /** @return number of documents passed to {@link #process(InputStream)} */
    public long getFilesCounter() {
        return filesCounter;
    }

    /** @return the timestamp (epoch milliseconds) set by {@link #mark()}, or 0 if unset */
    public long getMarkTime() {
        return markTime;
    }

    /** @return milliseconds elapsed since the mark time, or 0 if no mark has been set */
    public long getSpentTime() {
        return markTime == 0 ? 0 : System.currentTimeMillis() - markTime;
    }

    /** @return whether progress messages are printed to standard output */
    public boolean isVerbose() {
        return verbose;
    }

    /** @param verbose {@code true} to print progress messages to standard output */
    public void setVerbose(boolean verbose) {
        this.verbose = verbose;
    }

    /** Resets all counters and the mark time to zero. */
    public void resetCounters() {
        existingRecords = deletedRecords = brokenRecords = filesCounter = markTime = 0;
    }

    /** Sets the mark time to the current time. */
    public void mark() {
        markTime = System.currentTimeMillis();
    }

    /** @return the expected layout of incoming documents (defaults to {@link XmlType#oai}) */
    public XmlType getType() {
        return type;
    }

    /** @param type the expected layout of incoming documents */
    public void setType(XmlType type) {
        this.type = type;
    }

    @Override
    public void setSource(String source) {
        this.source = source;
        //   this.needAndsGroup = GraphUtils.SOURCE_ANDS.equals(source);
    }

    @Override
    public String getSource() {
        return source;
    }

    /**
     * Unmarshals one XML document and converts it into a graph.
     *
     * <p>Sets the mark time on first use and increments the file counter.
     *
     * @param xml stream with the XML document; it is not closed by this method
     * @return a new graph holding the schemas, nodes and relationships found
     * @throws Exception if unmarshalling fails or an OAI record carries no
     *         {@code RegistryObjects} metadata
     */
    @Override
    public Graph process(InputStream xml) throws Exception {
        if (0 == markTime) {
            markTime = System.currentTimeMillis();
        }

        ++filesCounter;

        Graph graph = new Graph();
        graph.addSchema(new GraphSchema(source, GraphUtils.PROPERTY_KEY, true));
        graph.addSchema(new GraphSchema(source, GraphUtils.PROPERTY_NLA, false));
        //   graph.addSchema(new GraphSchema(source, GraphUtils.PROPERTY_NHMRC_ID, false));
        //   graph.addSchema(new GraphSchema(source, GraphUtils.PROPERTY_ARC_ID, false));
        graph.addSchema(new GraphSchema(source, GraphUtils.PROPERTY_ORCID_ID, false));
        graph.addSchema(new GraphSchema(source, GraphUtils.PROPERTY_DOI, false));
        graph.addSchema(new GraphSchema(source, GraphUtils.PROPERTY_PURL, false));

        if (type == XmlType.oai) {
            processOai((JAXBElement<?>) unmarshaller.unmarshal(xml), graph);
        } else {
            processRegistryObjects((RegistryObjects) unmarshaller.unmarshal(xml), graph, false);
        }

        return graph;
    }

    /**
     * Prints the collected counters and timing to the given stream.
     *
     * @param out destination stream
     */
    public void printStatistics(PrintStream out) {
        long spentTime = getSpentTime();
        // Guard the average: with no records the division would print NaN or Infinity.
        float perRecord = existingRecords > 0 ? (float) spentTime / (float) existingRecords : 0f;
        out.println(String.format(
                "Processed %d files.\nSpent %d millisecods.\nFound %d records.\nFound %d deleted records.\nFound %d broken records.\nSpent ~ %f milliseconds per record.",
                filesCounter, spentTime, existingRecords, deletedRecords, brokenRecords, perRecord));
    }

    /**
     * Walks every record of an OAI-PMH {@code ListRecords} response and
     * processes its {@code RegistryObjects} metadata.
     *
     * @throws Exception if a record has no metadata or the metadata is not a
     *         {@code RegistryObjects} element
     */
    private void processOai(JAXBElement<?> element, Graph graph) throws Exception {
        OAIPMHtype root = (OAIPMHtype) element.getValue();
        ListRecordsType records = root.getListRecords();
        if (null == records || null == records.getRecord()) {
            System.out.println("Unable to find records");
            return;
        }

        for (RecordType record : records.getRecord()) {
            HeaderType header = record.getHeader();
            // A record without a header cannot be flagged as deleted; treat it as live.
            StatusType status = null != header ? header.getStatus() : null;
            boolean deleted = status == StatusType.DELETED;

            if (null == record.getMetadata()) {
                throw new Exception("Unable to find metadata");
            }

            Object metadata = record.getMetadata().getAny();
            if (!(metadata instanceof RegistryObjects)) {
                throw new Exception("Metadata is not in rif format");
            }

            processRegistryObjects((RegistryObjects) metadata, graph, deleted);
        }
    }

    /**
     * Processes every researcher, grant, dataset, publication and relation
     * of a {@code RegistryObjects} element, in that order.
     *
     * @param deleted whether the node records should be added as deleted nodes
     */
    private void processRegistryObjects(RegistryObjects registryObjects, Graph graph, boolean deleted)
            throws Exception {
        if (null == registryObjects) {
            return;
        }

        if (null != registryObjects.getResearchers()) {
            for (Researcher researcher : registryObjects.getResearchers().getResearcher()) {
                processResearcher(researcher, graph, deleted);
            }
        }

        if (null != registryObjects.getGrants()) {
            for (Grant grant : registryObjects.getGrants().getGrant()) {
                processGrant(grant, graph, deleted);
            }
        }

        if (null != registryObjects.getDatasets()) {
            for (Dataset dataset : registryObjects.getDatasets().getDataset()) {
                processDataset(dataset, graph, deleted);
            }
        }

        if (null != registryObjects.getPublications()) {
            for (Publication publication : registryObjects.getPublications().getPublication()) {
                processPublication(publication, graph, deleted);
            }
        }

        if (null != registryObjects.getRelations()) {
            for (Relation relation : registryObjects.getRelations().getRelation()) {
                processRelation(relation, graph);
            }
        }
    }

    /**
     * Formats a timestamp with the shared UTC formatter.
     *
     * @return the formatted timestamp, or {@code null} when none was given
     */
    private static String formatTimestamp(XMLGregorianCalendar timestamp) {
        if (null == timestamp) {
            return null;
        }
        // SimpleDateFormat keeps mutable state; serialize access to the shared instance.
        synchronized (formatter) {
            return formatter.format(timestamp.toGregorianCalendar().getTime());
        }
    }

    /**
     * Counts a record, prints the verbose progress lines and validates its key.
     * A record without a key is counted as broken.
     *
     * @return {@code true} when the key is usable, {@code false} when the record must be skipped
     */
    private boolean registerRecord(String typeName, String key) {
        ++existingRecords;

        if (verbose) {
            System.out.println("Processing " + typeName);
        }

        if (StringUtils.isEmpty(key)) {
            ++brokenRecords;
            return false;
        }

        if (verbose) {
            System.out.println("Key: " + key);
        }

        return true;
    }

    /** Returns the record's own source, falling back to the crosswalk source when it is empty. */
    private String resolveSource(String recordSource) {
        return StringUtils.isEmpty(recordSource) ? this.source : recordSource;
    }

    /** Builds a node keyed by (source, key) and labelled with the crosswalk source and the node type. */
    private GraphNode createNode(String nodeSource, String key, String nodeType) {
        return GraphNode.builder().withKey(new GraphKey(nodeSource, key)).withNodeSource(nodeSource)
                .withNodeType(nodeType).withLabel(this.source).withLabel(nodeType).build();
    }

    /** Flags the node as deleted, adds it to the graph and counts it. */
    private void addDeletedNode(GraphNode node, Graph graph) {
        node.setDeleted(true);
        graph.addNode(node);
        ++deletedRecords;
    }

    /**
     * Converts a researcher record into a graph node.
     *
     * @return {@code false} when the record has no key and was skipped, {@code true} otherwise
     */
    private boolean processResearcher(final Researcher researcher, final Graph graph, boolean deleted) {
        String key = researcher.getKey();
        if (!registerRecord("Researcher", key)) {
            return false;
        }

        GraphNode node = createNode(resolveSource(researcher.getSource()), key, GraphUtils.TYPE_RESEARCHER);

        if (deleted) {
            addDeletedNode(node, graph);
            return true;
        }

        String localId = researcher.getLocalId();
        if (!StringUtils.isEmpty(localId)) {
            node.setProperty(GraphUtils.PROPERTY_LOCAL_ID, localId);
        }

        String lastUpdated = formatTimestamp(researcher.getLastUpdated());
        if (!StringUtils.isEmpty(lastUpdated)) {
            node.setProperty(GraphUtils.PROPERTY_LAST_UPDATED, lastUpdated);
        }

        String url = GraphUtils.extractFormalizedUrl(researcher.getUrl());
        if (!StringUtils.isEmpty(url)) {
            node.setProperty(GraphUtils.PROPERTY_URL, url);
        }

        String fullName = researcher.getFullName();
        if (!StringUtils.isEmpty(fullName)) {
            node.setProperty(GraphUtils.PROPERTY_FULL_NAME, fullName);
        }

        String firstName = researcher.getFirstName();
        if (!StringUtils.isEmpty(firstName)) {
            node.setProperty(GraphUtils.PROPERTY_FIRST_NAME, firstName);
        }

        String lastName = researcher.getLastName();
        if (!StringUtils.isEmpty(lastName)) {
            node.setProperty(GraphUtils.PROPERTY_LAST_NAME, lastName);
        }

        String orcid = GraphUtils.extractOrcidId(researcher.getOrcid());
        if (!StringUtils.isEmpty(orcid)) {
            node.setProperty(GraphUtils.PROPERTY_ORCID_ID, orcid);
        }

        String scopus = GraphUtils.extractScopusAuthorId(researcher.getScopusAuthorId());
        if (!StringUtils.isEmpty(scopus)) {
            node.setProperty(GraphUtils.PROPERTY_SCOPUS_ID, scopus);
        }

        graph.addNode(node);

        return true;
    }

    /**
     * Converts a grant record into a graph node.
     *
     * @return {@code false} when the record has no key and was skipped, {@code true} otherwise
     */
    private boolean processGrant(final Grant grant, final Graph graph, boolean deleted) {
        String key = grant.getKey();
        if (!registerRecord("Grant", key)) {
            return false;
        }

        GraphNode node = createNode(resolveSource(grant.getSource()), key, GraphUtils.TYPE_GRANT);

        if (deleted) {
            addDeletedNode(node, graph);
            return true;
        }

        String localId = grant.getLocalId();
        if (!StringUtils.isEmpty(localId)) {
            node.setProperty(GraphUtils.PROPERTY_LOCAL_ID, localId);
        }

        String lastUpdated = formatTimestamp(grant.getLastUpdated());
        if (!StringUtils.isEmpty(lastUpdated)) {
            node.setProperty(GraphUtils.PROPERTY_LAST_UPDATED, lastUpdated);
        }

        String url = GraphUtils.extractFormalizedUrl(grant.getUrl());
        if (!StringUtils.isEmpty(url)) {
            node.setProperty(GraphUtils.PROPERTY_URL, url);
        }

        String title = grant.getTitle();
        if (!StringUtils.isEmpty(title)) {
            node.setProperty(GraphUtils.PROPERTY_TITLE, title);
        }

        String purl = GraphUtils.extractFormalizedUrl(grant.getPurl());
        if (!StringUtils.isEmpty(purl)) {
            node.setProperty(GraphUtils.PROPERTY_PURL, purl);
        }

        String participantList = grant.getParticipantList();
        if (!StringUtils.isEmpty(participantList)) {
            // Comma-separated list; whitespace around the commas is ignored.
            String[] participants = participantList.trim().split("\\s*,\\s*");
            if (participants.length > 0) {
                node.setProperty(GraphUtils.PROPERTY_PARTICIPANTS, participants);
            }
        }

        String funder = GraphUtils.extractFormalizedUrl(grant.getFunder());
        if (!StringUtils.isEmpty(funder)) {
            node.setProperty(GraphUtils.PROPERTY_FUNDER, funder);
        }

        XMLGregorianCalendar startYear = grant.getStartYear();
        if (null != startYear && startYear.getYear() > 0) {
            node.setProperty(GraphUtils.PROPERTY_START_YEAR, startYear.getYear());
        }

        XMLGregorianCalendar endYear = grant.getEndYear();
        if (null != endYear && endYear.getYear() > 0) {
            node.setProperty(GraphUtils.PROPERTY_END_YEAR, endYear.getYear());
        }

        graph.addNode(node);

        return true;
    }

    /**
     * Converts a dataset record into a graph node.
     *
     * @return {@code false} when the record has no key and was skipped, {@code true} otherwise
     */
    private boolean processDataset(final Dataset dataset, final Graph graph, boolean deleted) {
        String key = dataset.getKey();
        if (!registerRecord("Dataset", key)) {
            return false;
        }

        GraphNode node = createNode(resolveSource(dataset.getSource()), key, GraphUtils.TYPE_DATASET);

        if (deleted) {
            addDeletedNode(node, graph);
            return true;
        }

        String localId = dataset.getLocalId();
        if (!StringUtils.isEmpty(localId)) {
            node.setProperty(GraphUtils.PROPERTY_LOCAL_ID, localId);
        }

        String lastUpdated = formatTimestamp(dataset.getLastUpdated());
        if (!StringUtils.isEmpty(lastUpdated)) {
            node.setProperty(GraphUtils.PROPERTY_LAST_UPDATED, lastUpdated);
        }

        String url = GraphUtils.extractFormalizedUrl(dataset.getUrl());
        if (!StringUtils.isEmpty(url)) {
            node.setProperty(GraphUtils.PROPERTY_URL, url);
        }

        String title = dataset.getTitle();
        if (!StringUtils.isEmpty(title)) {
            node.setProperty(GraphUtils.PROPERTY_TITLE, title);
        }

        String doi = GraphUtils.extractDoi(dataset.getDoi());
        if (!StringUtils.isEmpty(doi)) {
            node.setProperty(GraphUtils.PROPERTY_DOI, doi);
        }

        XMLGregorianCalendar publicationYear = dataset.getPublicationYear();
        if (null != publicationYear && publicationYear.getYear() > 0) {
            node.setProperty(GraphUtils.PROPERTY_PUBLICATION_YEAR, publicationYear.getYear());
        }

        String license = GraphUtils.extractFormalizedUrl(dataset.getLicense());
        if (!StringUtils.isEmpty(license)) {
            node.setProperty(GraphUtils.PROPERTY_LICENSE, license);
        }

        BigDecimal megabyte = dataset.getMegabyte();
        if (null != megabyte) {
            node.setProperty(GraphUtils.PROPERTY_MEGABYTE, megabyte.toString());
        }

        graph.addNode(node);

        return true;
    }

    /**
     * Converts a publication record into a graph node.
     *
     * @return {@code false} when the record has no key and was skipped, {@code true} otherwise
     */
    private boolean processPublication(final Publication publication, final Graph graph, boolean deleted) {
        String key = publication.getKey();
        if (!registerRecord("Publication", key)) {
            return false;
        }

        GraphNode node = createNode(resolveSource(publication.getSource()), key, GraphUtils.TYPE_PUBLICATION);

        if (deleted) {
            addDeletedNode(node, graph);
            return true;
        }

        String localId = publication.getLocalId();
        if (!StringUtils.isEmpty(localId)) {
            node.setProperty(GraphUtils.PROPERTY_LOCAL_ID, localId);
        }

        String lastUpdated = formatTimestamp(publication.getLastUpdated());
        if (!StringUtils.isEmpty(lastUpdated)) {
            node.setProperty(GraphUtils.PROPERTY_LAST_UPDATED, lastUpdated);
        }

        String url = GraphUtils.extractFormalizedUrl(publication.getUrl());
        if (!StringUtils.isEmpty(url)) {
            node.setProperty(GraphUtils.PROPERTY_URL, url);
        }

        String title = publication.getTitle();
        if (!StringUtils.isEmpty(title)) {
            node.setProperty(GraphUtils.PROPERTY_TITLE, title);
        }

        String authorsList = publication.getAuthorsList();
        if (!StringUtils.isEmpty(authorsList)) {
            node.setProperty(GraphUtils.PROPERTY_AUTHORS, authorsList);
        }

        String doi = GraphUtils.extractDoi(publication.getDoi());
        if (!StringUtils.isEmpty(doi)) {
            node.setProperty(GraphUtils.PROPERTY_DOI, doi);
        }

        XMLGregorianCalendar publicationYear = publication.getPublicationYear();
        if (null != publicationYear && publicationYear.getYear() > 0) {
            node.setProperty(GraphUtils.PROPERTY_PUBLICATION_YEAR, publicationYear.getYear());
        }

        String scopusEid = GraphUtils.extractScopusEID(publication.getScopusEid());
        if (!StringUtils.isEmpty(scopusEid)) {
            node.setProperty(GraphUtils.PROPERTY_SCOPUS_EID, scopusEid);
        }

        graph.addNode(node);

        return true;
    }

    /**
     * Converts a relation record into a graph relationship between two keys
     * of the crosswalk source.
     *
     * @return {@code false} when either end of the relation is missing and it
     *         was skipped, {@code true} otherwise
     */
    private boolean processRelation(final Relation relation, final Graph graph) {
        if (verbose) {
            System.out.println("Processing new Relation");
        }

        // Fall back to the generic relationship only when the record carries no label;
        // an explicit label must be preserved.
        String label = relation.getLabel();
        if (StringUtils.isEmpty(label)) {
            label = GraphUtils.RELATIONSHIP_RELATED_TO;
        }

        String from = relation.getFromKey();
        if (StringUtils.isEmpty(from)) {
            return false;
        }

        String to = relation.getToUri();
        if (StringUtils.isEmpty(to)) {
            return false;
        }

        GraphRelationship relationship = GraphRelationship.builder().withRelationship(label).withStart(source, from)
                .withEnd(source, to).build();

        graph.addRelationship(relationship);

        return true;
    }
}