ca.uhn.fhir.jpa.term.TerminologyLoaderSvc.java Source code

Java tutorial

Introduction

Here is the source code for ca.uhn.fhir.jpa.term.TerminologyLoaderSvc.java

Source

package ca.uhn.fhir.jpa.term;

import static org.apache.commons.lang3.StringUtils.isNotBlank;

/*
 * #%L
 * HAPI FHIR JPA Server
 * %%
 * Copyright (C) 2014 - 2016 University Health Network
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.csv.QuoteMode;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.Validate;
import org.springframework.beans.factory.annotation.Autowired;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Charsets;

import ca.uhn.fhir.jpa.entity.TermCodeSystemVersion;
import ca.uhn.fhir.jpa.entity.TermConcept;
import ca.uhn.fhir.jpa.entity.TermConceptParentChildLink;
import ca.uhn.fhir.jpa.entity.TermConceptParentChildLink.RelationshipTypeEnum;
import ca.uhn.fhir.jpa.util.Counter;
import ca.uhn.fhir.rest.method.RequestDetails;
import ca.uhn.fhir.rest.server.exceptions.InternalErrorException;
import ca.uhn.fhir.rest.server.exceptions.InvalidRequestException;

public class TerminologyLoaderSvc implements IHapiTerminologyLoaderSvc {
    /** Interval, in records, between progress log messages while iterating over a CSV file. */
    private static final int LOG_INCREMENT = 100000;

    /** Name fragment (matched with {@code String.contains}) of the zip entry read by {@link LoincHandler}. */
    public static final String LOINC_FILE = "loinc.csv";

    /** Name fragment (matched with {@code String.contains}) of the zip entry read by {@link LoincHierarchyHandler}. */
    public static final String LOINC_HIERARCHY_FILE = "MULTI-AXIAL_HIERARCHY.CSV";
    private static final org.slf4j.Logger ourLog = org.slf4j.LoggerFactory.getLogger(TerminologyLoaderSvc.class);

    /*
     * Name fragments of the SNOMED CT zip entries. Each is matched against the zip entry name with
     * String.contains, so the trailing part of the real filename does not need to be known.
     */
    public static final String SCT_FILE_CONCEPT = "Terminology/sct2_Concept_Full_";
    public static final String SCT_FILE_DESCRIPTION = "Terminology/sct2_Description_Full-en";
    public static final String SCT_FILE_RELATIONSHIP = "Terminology/sct2_Relationship_Full";
    /** Terminology service that receives the assembled code system version; injected by Spring. */
    @Autowired
    private IHapiTerminologySvc myTermSvc;

    /**
     * Walks the hierarchy below {@code theConcept} depth-first and removes every parent/child link
     * whose child code already appears in the chain of codes currently being visited, i.e. a link
     * that would close a cycle.
     *
     * @param theConcept
     *           the concept whose children are examined
     * @param theChain
     *           codes on the path from the starting concept down to (but not yet including)
     *           {@code theConcept}; this method pushes the concept's code on entry and pops it on exit
     * @param theCode2concept
     *           lookup used only to build the log message describing the offending chain
     * @param theCircularCounter
     *           not read or modified here; only handed on to recursive calls
     */
    private void dropCircularRefs(TermConcept theConcept, ArrayList<String> theChain,
            Map<String, TermConcept> theCode2concept, Counter theCircularCounter) {

        theChain.add(theConcept.getCode());
        for (Iterator<TermConceptParentChildLink> childIter = theConcept.getChildren().iterator(); childIter
                .hasNext();) {
            TermConceptParentChildLink next = childIter.next();
            TermConcept nextChild = next.getChild();
            if (theChain.contains(nextChild.getCode())) {

                StringBuilder b = new StringBuilder();
                b.append("Removing circular reference code ");
                b.append(nextChild.getCode());
                b.append(" from parent ");
                b.append(next.getParent().getCode());
                b.append(". Chain was: ");
                for (String nextInChain : theChain) {
                    TermConcept nextCode = theCode2concept.get(nextInChain);
                    // Building a diagnostic must never abort the load: tolerate a code that is
                    // missing from the lookup map and a concept without a display.
                    String display = null;
                    if (nextCode != null) {
                        b.append(nextCode.getCode());
                        display = nextCode.getDisplay();
                    } else {
                        b.append(nextInChain);
                    }
                    b.append('[');
                    if (display != null) {
                        b.append(StringUtils.substring(display, 0, 20).replace("[", "").replace("]", "").trim());
                    }
                    b.append("] ");
                }
                // The message is already fully built. Log it without arguments so that a "{}"
                // occurring inside a display is not treated as a placeholder.
                ourLog.info(b.toString());
                // Remove the link from both ends; the iterator is used so the children
                // collection can be modified while it is being traversed.
                childIter.remove();
                nextChild.getParents().remove(next);

            } else {
                dropCircularRefs(nextChild, theChain, theCode2concept, theCircularCounter);
            }
        }
        theChain.remove(theChain.size() - 1);

    }

    /**
     * Verifies that the supplied zip archives, taken together, contain at least one entry for each
     * expected filename fragment. Nothing is actually extracted; the entries are only listed.
     *
     * @param theZipBytes
     *           raw bytes of one or more zip archives
     * @param theExpectedFilenameFragments
     *           fragments that must each occur in the name of at least one zip entry
     * @throws InvalidRequestException
     *            if any expected fragment was not found
     * @throws InternalErrorException
     *            if a zip archive cannot be read
     */
    private void extractFiles(List<byte[]> theZipBytes, List<String> theExpectedFilenameFragments) {
        Set<String> matched = new HashSet<String>();

        for (byte[] zipContents : theZipBytes) {
            ZipInputStream zipStream = new ZipInputStream(
                    new BufferedInputStream(new ByteArrayInputStream(zipContents)));
            try {
                ZipEntry entry = zipStream.getNextEntry();
                while (entry != null) {
                    for (String fragment : theExpectedFilenameFragments) {
                        if (entry.getName().contains(fragment)) {
                            matched.add(fragment);
                        }
                    }
                    entry = zipStream.getNextEntry();
                }
            } catch (IOException e) {
                throw new InternalErrorException(e);
            } finally {
                IOUtils.closeQuietly(zipStream);
            }
        }

        if (!matched.containsAll(theExpectedFilenameFragments)) {
            throw new InvalidRequestException(
                    "Invalid input zip file, expected zip to contain the following name fragments: "
                            + theExpectedFilenameFragments + " but found: " + matched);
        }

    }

    /**
     * Returns the first argument that is not blank.
     *
     * @param theStrings
     *           candidates, examined in order
     * @return the first non-blank candidate, or an empty string if every candidate is blank
     */
    public String firstNonBlank(String... theStrings) {
        for (String candidate : theStrings) {
            if (isNotBlank(candidate)) {
                return candidate;
            }
        }
        return "";
    }

    /**
     * Looks up the concept registered under {@code id}, creating and registering a new one
     * (attached to {@code codeSystemVersion}) if none exists yet.
     *
     * @return the existing or newly created concept, never {@code null}
     */
    private TermConcept getOrCreateConcept(TermCodeSystemVersion codeSystemVersion,
            Map<String, TermConcept> id2concept, String id) {
        TermConcept existing = id2concept.get(id);
        if (existing != null) {
            return existing;
        }

        TermConcept created = new TermConcept();
        id2concept.put(id, created);
        created.setCodeSystem(codeSystemVersion);
        return created;
    }

    /**
     * Scans every supplied zip archive for entries whose name contains {@code fileNamePart},
     * parses each matching entry as delimited text with a header row, and passes every record
     * to {@code handler}.
     *
     * @param theZipBytes
     *           raw bytes of one or more zip archives
     * @param fileNamePart
     *           fragment that a zip entry name must contain to be processed
     * @param handler
     *           callback invoked once per parsed record
     * @param theDelimiter
     *           field delimiter of the file
     * @param theQuoteMode
     *           quote mode to apply (with {@code "} as the quote character), or {@code null} to
     *           parse without quote handling
     * @throws InternalErrorException
     *            if an archive or entry cannot be read
     * @throws IllegalArgumentException
     *            if no entry matching {@code fileNamePart} was found in any archive
     */
    private void iterateOverZipFile(List<byte[]> theZipBytes, String fileNamePart, IRecordHandler handler,
            char theDelimiter, QuoteMode theQuoteMode) {
        boolean found = false;

        for (byte[] nextZipBytes : theZipBytes) {
            ZipInputStream zis = new ZipInputStream(
                    new BufferedInputStream(new ByteArrayInputStream(nextZipBytes)));
            try {
                for (ZipEntry nextEntry; (nextEntry = zis.getNextEntry()) != null;) {
                    String nextFilename = nextEntry.getName();
                    if (!nextFilename.contains(fileNamePart)) {
                        continue;
                    }

                    ourLog.info("Processing file {}", nextFilename);
                    found = true;

                    // The reader and parser are deliberately NOT closed: closing either one would
                    // close the shared ZipInputStream and end iteration over the remaining entries.
                    // The zip stream itself is closed in the finally block below.
                    Reader reader = new InputStreamReader(zis, Charsets.UTF_8);
                    CSVFormat format = CSVFormat.newFormat(theDelimiter).withFirstRecordAsHeader();
                    if (theQuoteMode != null) {
                        format = format.withQuote('"').withQuoteMode(theQuoteMode);
                    }
                    CSVParser parsed = new CSVParser(reader, format);
                    Iterator<CSVRecord> iter = parsed.iterator();
                    ourLog.debug("Header map: {}", parsed.getHeaderMap());

                    // Progress is logged after the first record and then each time the record
                    // count reaches the next multiple of LOG_INCREMENT.
                    int count = 0;
                    int nextLoggedCount = 0;
                    while (iter.hasNext()) {
                        CSVRecord nextRecord = iter.next();
                        handler.accept(nextRecord);
                        count++;
                        if (count >= nextLoggedCount) {
                            ourLog.info(" * Processed {} records in {}", count, nextFilename);
                            nextLoggedCount += LOG_INCREMENT;
                        }
                    }
                }
            } catch (IOException e) {
                throw new InternalErrorException(e);
            } finally {
                IOUtils.closeQuietly(zis);
            }
        }

        // This should always be true, but just in case we've introduced a bug...
        Validate.isTrue(found, "No zip entry with a name containing \"%s\" was found", fileNamePart);
    }

    /**
     * Loads a LOINC distribution: first checks that the archives contain both the LOINC file and
     * the hierarchy file, then processes them.
     *
     * @param theZipBytes
     *           raw bytes of one or more zip archives
     * @param theRequestDetails
     *           request context handed on when the code system is stored
     * @return statistics about the upload
     */
    @Override
    public UploadStatistics loadLoinc(List<byte[]> theZipBytes, RequestDetails theRequestDetails) {
        // Fail early if a required file is missing from the upload
        extractFiles(theZipBytes, Arrays.asList(LOINC_FILE, LOINC_HIERARCHY_FILE));

        ourLog.info("Beginning LOINC processing");

        UploadStatistics stats = processLoincFiles(theZipBytes, theRequestDetails);
        return stats;
    }

    /**
     * Loads a SNOMED CT distribution: first checks that the archives contain the description,
     * relationship and concept files, then processes them.
     *
     * @param theZipBytes
     *           raw bytes of one or more zip archives
     * @param theRequestDetails
     *           request context handed on when the code system is stored
     * @return statistics about the upload
     */
    @Override
    public UploadStatistics loadSnomedCt(List<byte[]> theZipBytes, RequestDetails theRequestDetails) {
        // Fail early if a required file is missing from the upload
        extractFiles(theZipBytes, Arrays.asList(SCT_FILE_DESCRIPTION, SCT_FILE_RELATIONSHIP, SCT_FILE_CONCEPT));

        ourLog.info("Beginning SNOMED CT processing");

        UploadStatistics stats = processSnomedCtFiles(theZipBytes, theRequestDetails);
        return stats;
    }

    /**
     * Builds a LOINC code system version from the supplied archives and stores it.
     * <p>
     * The concept file is read first, then the hierarchy file links concepts as parent/child.
     * Concepts that end up without any parent become the root concepts of the code system
     * version. Note that {@code theZipBytes} is cleared once both files have been read.
     * </p>
     *
     * @return statistics carrying the total number of concepts loaded
     */
    UploadStatistics processLoincFiles(List<byte[]> theZipBytes, RequestDetails theRequestDetails) {
        final TermCodeSystemVersion version = new TermCodeSystemVersion();
        final Map<String, TermConcept> conceptsByCode = new HashMap<String, TermConcept>();

        iterateOverZipFile(theZipBytes, LOINC_FILE, new LoincHandler(version, conceptsByCode), ',',
                QuoteMode.NON_NUMERIC);

        iterateOverZipFile(theZipBytes, LOINC_HIERARCHY_FILE, new LoincHierarchyHandler(version, conceptsByCode),
                ',', QuoteMode.NON_NUMERIC);

        theZipBytes.clear();

        // Every concept without a parent is a root of the hierarchy. Concepts with blank
        // codes are not filtered out here.
        for (TermConcept candidate : conceptsByCode.values()) {
            if (candidate.getParents().isEmpty()) {
                version.getConcepts().add(candidate);
            }
        }

        ourLog.info("Have {} total concepts, {} root concepts", conceptsByCode.size(),
                version.getConcepts().size());

        storeCodeSystem(theRequestDetails, version, LOINC_URL);

        return new UploadStatistics(conceptsByCode.size());
    }

    /**
     * Stores the given code system version through the terminology service, with deferred
     * processing switched off for the duration of the call.
     *
     * @param theRequestDetails
     *           request context handed on to the terminology service
     * @param codeSystemVersion
     *           the version to store
     * @param url
     *           the code system URL under which the version is stored
     */
    private void storeCodeSystem(RequestDetails theRequestDetails, final TermCodeSystemVersion codeSystemVersion,
            String url) {
        myTermSvc.setProcessDeferred(false);
        try {
            myTermSvc.storeNewCodeSystemVersion(url, codeSystemVersion, theRequestDetails);
        } finally {
            // Always switch deferred processing back on, even if storing failed; otherwise a
            // single failed upload would leave it disabled on the shared service.
            myTermSvc.setProcessDeferred(true);
        }
    }

    /**
     * Builds a SNOMED CT code system version from the supplied archives and stores it.
     * <p>
     * Steps: collect the IDs of active concepts, create concepts from the description file,
     * link them using the relationship file, determine the concepts without parents (roots),
     * strip circular references below each root, and store the result. Note that
     * {@code theZipBytes} is cleared once all three files have been read.
     * </p>
     *
     * @return statistics carrying the total number of concepts loaded
     */
    UploadStatistics processSnomedCtFiles(List<byte[]> theZipBytes, RequestDetails theRequestDetails) {
        final TermCodeSystemVersion version = new TermCodeSystemVersion();
        final Map<String, TermConcept> conceptsById = new HashMap<String, TermConcept>();
        final Map<String, TermConcept> conceptsByCode = new HashMap<String, TermConcept>();
        final Set<String> activeConceptIds = new HashSet<String>();

        iterateOverZipFile(theZipBytes, SCT_FILE_CONCEPT, new SctHandlerConcept(activeConceptIds), '\t', null);

        ourLog.info("Have {} valid concept IDs", activeConceptIds.size());

        iterateOverZipFile(theZipBytes, SCT_FILE_DESCRIPTION,
                new SctHandlerDescription(activeConceptIds, conceptsByCode, conceptsById, version), '\t', null);

        ourLog.info("Got {} concepts, cloning map", conceptsByCode.size());
        // Start from a copy containing every concept; non-roots are pruned further down
        final HashMap<String, TermConcept> roots = new HashMap<String, TermConcept>(conceptsByCode);

        iterateOverZipFile(theZipBytes, SCT_FILE_RELATIONSHIP,
                new SctHandlerRelationship(version, roots, conceptsByCode), '\t', null);

        theZipBytes.clear();

        ourLog.info("Looking for root codes");
        Iterator<TermConcept> rootCandidates = roots.values().iterator();
        while (rootCandidates.hasNext()) {
            TermConcept candidate = rootCandidates.next();
            if (!candidate.getParents().isEmpty()) {
                rootCandidates.remove();
            }
        }

        ourLog.info("Done loading SNOMED CT files - {} root codes, {} total codes", roots.size(),
                conceptsByCode.size());

        Counter scanned = new Counter();
        final int rootCount = roots.size();
        for (TermConcept root : roots.values()) {
            long scannedSoFar = scanned.getThenAdd();
            float percent = ((float) scannedSoFar / rootCount) * 100.0f;
            ourLog.info(" * Scanning for circular refs - have scanned {} / {} codes ({}%)", scannedSoFar,
                    rootCount, percent);
            dropCircularRefs(root, new ArrayList<String>(), conceptsByCode, scanned);
        }

        version.getConcepts().addAll(roots.values());
        storeCodeSystem(theRequestDetails, version, SCT_URL);

        return new UploadStatistics(conceptsByCode.size());
    }

    /**
     * Replaces the terminology service, which is otherwise injected. Intended for unit tests only.
     *
     * @param theTermSvc
     *           the terminology service to use from now on
     */
    @VisibleForTesting
    void setTermSvcForUnitTests(IHapiTerminologySvc theTermSvc) {
        this.myTermSvc = theTermSvc;
    }

    /**
     * Callback invoked by {@code iterateOverZipFile} once for every record parsed from a
     * matching zip entry.
     */
    private interface IRecordHandler {
        /**
         * Processes a single parsed record.
         *
         * @param theRecord
         *           the record to process
         */
        void accept(CSVRecord theRecord);
    }

    /**
     * Record handler for the LOINC concept file: creates one concept per row that has a
     * non-blank {@code LOINC_NUM} and registers it by code.
     */
    public class LoincHandler implements IRecordHandler {

        private final TermCodeSystemVersion myCodeSystemVersion;
        private final Map<String, TermConcept> myCode2Concept;

        /**
         * @param theCodeSystemVersion
         *           version that newly created concepts are attached to
         * @param theCode2concept
         *           map that receives each created concept, keyed by its code
         */
        public LoincHandler(TermCodeSystemVersion theCodeSystemVersion, Map<String, TermConcept> theCode2concept) {
            myCodeSystemVersion = theCodeSystemVersion;
            myCode2Concept = theCode2concept;
        }

        /**
         * Creates and registers the concept described by the row. Rows with a blank code are
         * skipped; a code that is already registered fails validation.
         */
        @Override
        public void accept(CSVRecord theRecord) {
            String code = theRecord.get("LOINC_NUM");
            if (!isNotBlank(code)) {
                return;
            }

            // Display preference: long common name, then short name, then consumer name
            String display = firstNonBlank(theRecord.get("LONG_COMMON_NAME"), theRecord.get("SHORTNAME"),
                    theRecord.get("CONSUMER_NAME"));

            TermConcept concept = new TermConcept(myCodeSystemVersion, code);
            concept.setDisplay(display);

            Validate.isTrue(!myCode2Concept.containsKey(code));
            myCode2Concept.put(code, concept);
        }

    }

    /**
     * Record handler for the LOINC hierarchy file: links the concept in {@code CODE} as an ISA
     * child of the concept in {@code IMMEDIATE_PARENT}, creating either concept if it is not
     * registered yet.
     */
    public class LoincHierarchyHandler implements IRecordHandler {

        private Map<String, TermConcept> myCode2Concept;
        private TermCodeSystemVersion myCodeSystemVersion;

        /**
         * @param theCodeSystemVersion
         *           version that newly created concepts are attached to
         * @param theCode2concept
         *           map used to look up concepts by code and to register newly created ones
         */
        public LoincHierarchyHandler(TermCodeSystemVersion theCodeSystemVersion,
                Map<String, TermConcept> theCode2concept) {
            myCodeSystemVersion = theCodeSystemVersion;
            myCode2Concept = theCode2concept;
        }

        /**
         * Adds the parent/child link described by the row. Rows where either code is blank are
         * ignored.
         */
        @Override
        public void accept(CSVRecord theRecord) {
            String parentCode = theRecord.get("IMMEDIATE_PARENT");
            String childCode = theRecord.get("CODE");
            String childCodeText = theRecord.get("CODE_TEXT");

            boolean linkable = isNotBlank(parentCode) && isNotBlank(childCode);
            if (!linkable) {
                return;
            }

            // The parent is resolved first; a parent not seen before gets a placeholder display
            TermConcept parent = findOrRegister(parentCode, "(unknown)");
            TermConcept child = findOrRegister(childCode, childCodeText);
            parent.addChild(child, RelationshipTypeEnum.ISA);
        }

        /**
         * Returns the concept registered under the code, creating and registering one with the
         * given display if there is none.
         */
        private TermConcept findOrRegister(String theCode, String theDisplay) {
            TermConcept known = myCode2Concept.get(theCode);
            if (known != null) {
                return known;
            }

            TermConcept created = new TermConcept();
            created.setCodeSystem(myCodeSystemVersion);
            created.setCode(theCode);
            created.setDisplay(theDisplay);
            myCode2Concept.put(theCode, created);
            return created;
        }

    }

    /**
     * Record handler for the SNOMED CT concept file: maintains the set of concept IDs whose most
     * recent row (by {@code effectiveTime}, compared as a string) is marked active.
     */
    private final class SctHandlerConcept implements IRecordHandler {

        private Set<String> myValidConceptIds;
        private Map<String, String> myConceptIdToMostRecentDate = new HashMap<String, String>();

        /**
         * @param theValidConceptIds
         *           set that is updated to hold the IDs of the currently active concepts
         */
        public SctHandlerConcept(Set<String> theValidConceptIds) {
            myValidConceptIds = theValidConceptIds;
        }

        /**
         * Applies the row if it is the first one seen for its concept ID or carries a later
         * effective time than the one recorded so far; otherwise the row is ignored.
         */
        @Override
        public void accept(CSVRecord theRecord) {
            String id = theRecord.get("id");
            String date = theRecord.get("effectiveTime");

            boolean seenBefore = myConceptIdToMostRecentDate.containsKey(id);
            if (seenBefore && myConceptIdToMostRecentDate.get(id).compareTo(date) >= 0) {
                // A row at least as recent has already been applied
                return;
            }

            if ("1".equals(theRecord.get("active"))) {
                myValidConceptIds.add(id);
            } else {
                myValidConceptIds.remove(id);
            }
            myConceptIdToMostRecentDate.put(id, date);
        }
    }

    /**
     * Record handler for the SNOMED CT description file: for every active description that
     * belongs to a valid concept ID, obtains the concept keyed by the description's own ID, sets
     * its code and display, and registers it under the concept ID.
     */
    private final class SctHandlerDescription implements IRecordHandler {
        private final Set<String> myValidConceptIds;
        private final Map<String, TermConcept> myCode2concept;
        private final Map<String, TermConcept> myId2concept;
        private final TermCodeSystemVersion myCodeSystemVersion;

        private SctHandlerDescription(Set<String> theValidConceptIds, Map<String, TermConcept> theCode2concept,
                Map<String, TermConcept> theId2concept, TermCodeSystemVersion theCodeSystemVersion) {
            myValidConceptIds = theValidConceptIds;
            myCode2concept = theCode2concept;
            myId2concept = theId2concept;
            myCodeSystemVersion = theCodeSystemVersion;
        }

        /**
         * Applies the row; inactive descriptions and descriptions of concepts that are not in
         * the valid set are ignored.
         */
        @Override
        public void accept(CSVRecord theRecord) {
            String descriptionId = theRecord.get("id");
            if ("1".equals(theRecord.get("active"))) {
                String conceptId = theRecord.get("conceptId");
                if (myValidConceptIds.contains(conceptId)) {
                    String display = theRecord.get("term");

                    TermConcept concept = getOrCreateConcept(myCodeSystemVersion, myId2concept, descriptionId);
                    concept.setCode(conceptId);
                    concept.setDisplay(display);
                    myCode2concept.put(conceptId, concept);
                }
            }
        }
    }

    /**
     * Record handler for the SNOMED CT relationship file: for rows whose type concept has the
     * display {@code "Is a (attribute)"}, an active row adds the source concept as an ISA child
     * of the destination concept, and an inactive row removes any such existing link. Rows of
     * any other relationship type have no effect.
     */
    private final class SctHandlerRelationship implements IRecordHandler {
        private final Map<String, TermConcept> myCode2concept;
        private final TermCodeSystemVersion myCodeSystemVersion;
        private final Map<String, TermConcept> myRootConcepts;

        /**
         * Relationship type displays that are knowingly skipped. Built once per handler instead
         * of once per record, because {@link #accept(CSVRecord)} is called for every row.
         */
        private final Set<String> myIgnoredTypes = new HashSet<String>(Arrays.asList(
                "Method (attribute)",
                "Direct device (attribute)",
                "Has focus (attribute)",
                "Access instrument",
                "Procedure site (attribute)",
                "Causative agent (attribute)",
                "Course (attribute)",
                "Finding site (attribute)",
                "Has definitional manifestation (attribute)"));

        /**
         * @param theCodeSystemVersion
         *           version set on the link object built for active rows
         * @param theRootConcepts
         *           stored but not used by this handler
         * @param theCode2concept
         *           lookup from concept code to concept, used to resolve the source, destination
         *           and type of each row
         */
        private SctHandlerRelationship(TermCodeSystemVersion theCodeSystemVersion,
                HashMap<String, TermConcept> theRootConcepts, Map<String, TermConcept> theCode2concept) {
            myCodeSystemVersion = theCodeSystemVersion;
            myRootConcepts = theRootConcepts;
            myCode2concept = theCode2concept;
        }

        /**
         * Applies the row. Rows that reference an unknown source, destination or type concept
         * are ignored, as are rows that relate a concept to itself.
         */
        @Override
        public void accept(CSVRecord theRecord) {
            String sourceId = theRecord.get("sourceId");
            String destinationId = theRecord.get("destinationId");
            String typeId = theRecord.get("typeId");
            boolean active = "1".equals(theRecord.get("active"));

            TermConcept typeConcept = myCode2concept.get(typeId);
            TermConcept sourceConcept = myCode2concept.get(sourceId);
            TermConcept targetConcept = myCode2concept.get(destinationId);
            if (sourceConcept != null && targetConcept != null && typeConcept != null) {
                // Constant-first comparison so a type concept without a display is simply
                // treated as "not ISA" instead of raising a NullPointerException.
                String typeDisplay = typeConcept.getDisplay();
                if ("Is a (attribute)".equals(typeDisplay)) {
                    RelationshipTypeEnum relationshipType = RelationshipTypeEnum.ISA;
                    if (!sourceId.equals(destinationId)) {
                        if (active) {
                            // NOTE(review): this link object is never attached to anything in this
                            // method; addChild below is what actually connects the two concepts.
                            // Kept unchanged in case the setters have side effects - confirm, then
                            // remove if they do not.
                            TermConceptParentChildLink link = new TermConceptParentChildLink();
                            link.setChild(sourceConcept);
                            link.setParent(targetConcept);
                            link.setRelationshipType(relationshipType);
                            link.setCodeSystem(myCodeSystemVersion);

                            targetConcept.addChild(sourceConcept, relationshipType);
                        } else {
                            // not active, so we're removing any existing links
                            // (iterate over a copy because the children collection is modified)
                            for (TermConceptParentChildLink next : new ArrayList<TermConceptParentChildLink>(
                                    targetConcept.getChildren())) {
                                if (next.getRelationshipType() == relationshipType) {
                                    if (next.getChild().getCode().equals(sourceConcept.getCode())) {
                                        next.getParent().getChildren().remove(next);
                                        next.getChild().getParents().remove(next);
                                    }
                                }
                            }
                        }
                    }
                } else if (myIgnoredTypes.contains(typeDisplay)) {
                    // known relationship type that is intentionally not loaded
                } else {
                    // unknown relationship type: currently skipped silently
                    // ourLog.warn("Unknown relationship type: {}/{}", typeId, typeConcept.getDisplay());
                }
            }
        }

    }

    /**
     * View of the current entry of a {@link ZipInputStream}. Reads are delegated to the zip
     * stream, while {@link #close()} only closes the current entry, so the zip stream itself
     * stays open and can be advanced to its next entry afterwards.
     */
    private static class ZippedFileInputStream extends InputStream {

        private final ZipInputStream is;

        /**
         * @param is
         *           the zip stream, already positioned on the entry to read
         */
        public ZippedFileInputStream(ZipInputStream is) {
            this.is = is;
        }

        /**
         * Closes the current zip entry only; the underlying zip stream is left open.
         */
        @Override
        public void close() throws IOException {
            is.closeEntry();
        }

        @Override
        public int read() throws IOException {
            return is.read();
        }

        /**
         * Delegates bulk reads to the zip stream. Without this override the inherited
         * implementation would fill the buffer one byte at a time through {@link #read()}.
         */
        @Override
        public int read(byte[] b, int off, int len) throws IOException {
            return is.read(b, off, len);
        }
    }

}