org.dash.valid.gl.GLStringUtilities.java Source code

Java tutorial

Introduction

Here is the source code for org.dash.valid.gl.GLStringUtilities.java

Source

/*
    
Copyright (c) 2014-2015 National Marrow Donor Program (NMDP)
    
This library is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published
by the Free Software Foundation; either version 3 of the License, or (at
your option) any later version.
    
This library is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.
    
You should have received a copy of the GNU Lesser General Public License
along with this library;  if not, write to the Free Software Foundation,
Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA.
    
> http://www.gnu.org/licenses/lgpl.html
    
*/
package org.dash.valid.gl;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.apache.commons.lang3.StringUtils;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.dash.valid.Locus;
import org.dash.valid.ars.AntigenRecognitionSiteLoader;
import org.dash.valid.cwd.CommonWellDocumentedLoader;
import org.dash.valid.gl.haplo.Haplotype;
import org.dash.valid.gl.haplo.MultiLocusHaplotype;
import org.dash.valid.gl.haplo.SingleLocusHaplotype;
import org.nmdp.gl.MultilocusUnphasedGenotype;
import org.nmdp.gl.client.GlClient;
import org.nmdp.gl.client.GlClientException;
import org.nmdp.gl.client.local.LocalGlClient;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

public class GLStringUtilities {
    /** Regex matching a single upper-case ASCII letter (A-Z). */
    private static final String ALPHA_REGEX = "[A-Z]";
    /**
     * Character class matching one of the GL String delimiters {@code ^ | + ~ /}.
     * Within this class it is used both as a regex and as the delimiter argument of
     * {@link StringTokenizer}; the tokenizer treats every character of the string
     * (brackets and backslashes included) as a delimiter, not as a pattern.
     */
    static final String GL_STRING_DELIMITER_REGEX = "[\\^\\|\\+~/]";
    /** Regex used to split a line of a delimited input file: a tab or a comma. */
    private static final String FILE_DELIMITER_REGEX = "[\t,]";
    /** Regex for a literal asterisk, used with {@link String#split(String)}. */
    public static final String ESCAPED_ASTERISK = "\\*";
    /** Regex matching one of the letters S, N, L or Q; tested against the last character of an allele name. */
    public static final String VARIANTS_REGEX = "[SNLQ]";
    /** Separator between the fields of an allele name. */
    public static final String COLON = ":";
    /**
     * Number of colon-separated fields used as the threshold in this class: names with fewer
     * fields are reported as unexpected, and protein-level reduction keeps this many fields.
     */
    public static final int P_GROUP_LEVEL = 2;

    /** Class-wide java.util.logging logger, named after this class. */
    private static final Logger LOGGER = Logger.getLogger(GLStringUtilities.class.getName());

    /**
     * Splits {@code value} into tokens with a {@link StringTokenizer}.
     * Each character of {@code delimiter} acts as a separator and empty tokens are never returned.
     *
     * @param value     the text to split
     * @param delimiter the set of delimiter characters
     * @return the tokens in order of appearance; empty when {@code value} holds no token
     */
    public static List<String> parse(String value, String delimiter) {
        List<String> tokens = new ArrayList<String>();

        for (StringTokenizer tokenizer = new StringTokenizer(value, delimiter); tokenizer.hasMoreTokens();) {
            tokens.add(tokenizer.nextToken());
        }

        return tokens;
    }

    /**
     * Queries {@code https://hml.nmdp.org/mac/api/imgtHlaReleases} and returns the first
     * {@code GLStringConstants.SPACE}-delimited token of the first line of the response
     * (presumably the newest release; the service contract is not visible from this class).
     * <p>
     * I/O failures are reported with {@code printStackTrace()} and are not propagated.
     *
     * @return the release token, or {@code null} when the request fails or the response
     *         contains no usable first line
     */
    public static String getLatestImgtRelease() {
        HttpURLConnection connection = null;
        String imgtRelease = null;

        try {
            URL url = new URL("https://hml.nmdp.org/mac/api/imgtHlaReleases");

            connection = (HttpURLConnection) url.openConnection();
            connection.setRequestMethod("GET");

            // try-with-resources closes the reader (and the wrapped stream) on every path
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
                String firstLine = reader.readLine();
                // readLine() yields null for an empty body; split() yields an empty array
                // for a line made only of separators - neither may be dereferenced blindly
                if (firstLine != null) {
                    String[] fields = firstLine.split(GLStringConstants.SPACE);
                    if (fields.length > 0) {
                        imgtRelease = fields[0];
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // connection is still null when the URL could not be built or opened
            if (connection != null) {
                connection.disconnect();
            }
        }

        return imgtRelease;
    }

    /**
     * Sends {@code typing} to the decode endpoint {@code https://hml.nmdp.org/mac/api/decode/}
     * and returns the first line of the response.
     * <p>
     * The IMGT/HLA release is read from the system property named by
     * {@code GLStringConstants.HLADB_PROPERTY}; when it is unset or equal to
     * {@code GLStringConstants.LATEST_HLADB}, {@link #getLatestImgtRelease()} is called instead.
     * The resolved release is not written back to the system property, so every call without a
     * pinned release performs that lookup again. {@code typing} is appended to the query string
     * verbatim (no URL encoding).
     * <p>
     * I/O failures are reported with {@code printStackTrace()} and are not propagated.
     *
     * @param typing the typing to decode
     * @return the first line of the service response, or {@code null} when the request fails
     *         or the response is empty
     */
    public static String decodeMAC(String typing) {
        String decodedValue = null;
        HttpURLConnection connection = null;

        try {
            String uri = "https://hml.nmdp.org/mac/api/decode/?";
            String imgtRelease = System.getProperty(GLStringConstants.HLADB_PROPERTY);
            if (imgtRelease == null || GLStringConstants.LATEST_HLADB.equals(imgtRelease)) {
                imgtRelease = getLatestImgtRelease();
            }
            URL url = new URL(uri + "imgtHlaRelease=" + imgtRelease + "&typing=" + typing + "&expand=false");

            connection = (HttpURLConnection) url.openConnection();
            connection.setRequestMethod("GET");

            // try-with-resources closes the reader (and the wrapped stream) on every path
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
                decodedValue = reader.readLine();
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // connection is still null when the URL could not be built or opened
            if (connection != null) {
                connection.disconnect();
            }
        }

        return decodedValue;
    }

    /**
     * Builds one {@link MultiLocusHaplotype} per gene copy found in the GL String of
     * {@code linkedGlString}.
     * <p>
     * Only GL Strings that contain more than one {@code GENE_PHASE_DELIMITER} and exactly one
     * {@code GENE_COPY_DELIMITER} are processed; any other input yields an empty list. The string
     * is split successively on the gene, genotype-ambiguity, gene-copy and gene-phase delimiters.
     * Each gene phase becomes a {@link SingleLocusHaplotype} keyed by its normalized locus, and the
     * phases of one gene copy are combined into a multi-locus haplotype whose sequence number is
     * the one-based position of the copy inside its genotype ambiguity.
     *
     * @param linkedGlString the genotype list supplying the GL String and the homozygosity flag
     * @return the haplotypes found, in encounter order; never {@code null}
     */
    public static List<Haplotype> buildHaplotypes(LinkageDisequilibriumGenotypeList linkedGlString) {
        final String glString = linkedGlString.getGLString();
        final List<Haplotype> haplotypes = new CopyOnWriteArrayList<Haplotype>();

        boolean eligible = StringUtils.countMatches(glString, GLStringConstants.GENE_PHASE_DELIMITER) > 1
                && StringUtils.countMatches(glString, GLStringConstants.GENE_COPY_DELIMITER) == 1;
        if (!eligible) {
            return haplotypes;
        }

        // remembers the normalized Locus for every locus name already seen
        final HashMap<String, Locus> lociByName = new HashMap<String, Locus>();

        for (String gene : GLStringUtilities.parse(glString, GLStringConstants.GENE_DELIMITER)) {
            for (String genotypeAmbiguity : GLStringUtilities.parse(gene,
                    GLStringConstants.GENOTYPE_AMBIGUITY_DELIMITER)) {
                // position of the gene copy within this genotype ambiguity
                int copyIndex = 0;

                for (String geneCopy : GLStringUtilities.parse(genotypeAmbiguity,
                        GLStringConstants.GENE_COPY_DELIMITER)) {
                    HashMap<Locus, SingleLocusHaplotype> haplotypesByLocus = new HashMap<Locus, SingleLocusHaplotype>();

                    for (String genePhase : GLStringUtilities.parse(geneCopy,
                            GLStringConstants.GENE_PHASE_DELIMITER)) {
                        // the locus name is everything in front of the first asterisk
                        String locusName = genePhase.split(GLStringUtilities.ESCAPED_ASTERISK)[0];
                        List<String> alleleAmbiguities = GLStringUtilities.parse(genePhase,
                                GLStringConstants.ALLELE_AMBIGUITY_DELIMITER);

                        Locus locus;
                        if (!lociByName.containsKey(locusName)) {
                            locus = Locus.normalizeLocus(Locus.lookup(locusName));
                            lociByName.put(locusName, locus);
                        } else {
                            locus = lociByName.get(locusName);
                        }

                        haplotypesByLocus.put(locus,
                                new SingleLocusHaplotype(locus, alleleAmbiguities, copyIndex));
                    }

                    MultiLocusHaplotype combined = new MultiLocusHaplotype(haplotypesByLocus,
                            linkedGlString.hasHomozygous(Locus.HLA_DRB345));
                    combined.setSequence(copyIndex + 1);
                    haplotypes.add(combined);

                    copyIndex++;
                }
            }
        }

        return haplotypes;
    }

    /**
     * Checks whether every allele in {@code glString} is already fully qualified, i.e. needs no
     * locus completion or allele-code decoding.
     * <p>
     * The string is tokenized on the GL String delimiters and each token must:
     * <ul>
     * <li>start with {@code GLStringConstants.HLA_DASH};</li>
     * <li>have at least {@link #P_GROUP_LEVEL} colon-separated fields, unless it ends with
     * {@code GLStringConstants.NNNN} (presumably the placeholder for an absent or untyped allele;
     * confirm against GLStringConstants);</li>
     * <li>have a non-empty second field that does not begin with an upper-case letter (such a
     * field is taken to be an allele code that still has to be decoded).</li>
     * </ul>
     *
     * @param glString the GL String to inspect
     * @return {@code true} when all tokens pass; {@code false} as soon as one fails
     */
    public static boolean validateGLStringFormat(String glString) {
        // StringTokenizer uses the characters of its second argument as a delimiter set
        StringTokenizer st = new StringTokenizer(glString, GL_STRING_DELIMITER_REGEX);

        while (st.hasMoreTokens()) {
            String token = st.nextToken();
            LOGGER.finest(token);

            if (!token.startsWith(GLStringConstants.HLA_DASH)) {
                LOGGER.warning("GLString is invalid: " + glString);
                LOGGER.warning("Locus not qualified with " + GLStringConstants.HLA_DASH + " for segment: " + token);
                return false;
            }

            String[] parts = token.split(COLON);

            if (parts.length < P_GROUP_LEVEL) {
                // The placeholder carries no fields; compare against the token itself (the
                // previous code compared the constant with the String[] and never matched)
                // and skip the second-field check, which would index past the array.
                if (token.endsWith(GLStringConstants.NNNN)) {
                    continue;
                }
                LOGGER.warning("GLString is invalid: " + glString);
                LOGGER.warning("Unexpected allele: " + token);
                return false;
            }

            // "a::b" splits into an empty second field; reject it instead of failing in substring()
            if (parts[1].isEmpty()) {
                LOGGER.warning("GLString is invalid: " + glString);
                LOGGER.warning("Unexpected allele: " + token);
                return false;
            }

            if (parts[1].substring(0, 1).matches(ALPHA_REGEX)) {
                LOGGER.info("GLString contains allele codes.  These will be decoded.");
                return false;
            }
        }

        return true;
    }

    /**
     * Collects the alleles of {@code glString} that are not in the common/well-documented set
     * supplied by {@link CommonWellDocumentedLoader}.
     * <p>
     * Each token of the GL String is mapped through the loader's accession map and the mapped
     * value is looked up in the CWD allele set; tokens whose mapped value is not in that set
     * (including tokens with no mapping) are returned. When the CWD set is empty no check is
     * possible and an empty set is returned.
     *
     * @param glString the GL String whose alleles are checked
     * @return the tokens not recognised as common or well documented; never {@code null}
     */
    public static Set<String> checkCommonWellDocumented(String glString) {
        CommonWellDocumentedLoader cwdLoader = CommonWellDocumentedLoader.getInstance();
        Set<String> commonAlleles = cwdLoader.getCwdAlleles();
        Set<String> uncommon = new HashSet<String>();

        if (commonAlleles.size() == 0) {
            return uncommon;
        }

        HashMap<String, String> accessions = cwdLoader.getAccessionMap();

        for (StringTokenizer tokens = new StringTokenizer(glString, GL_STRING_DELIMITER_REGEX); tokens
                .hasMoreTokens();) {
            String allele = tokens.nextToken();
            String accession = accessions.get(allele);

            if (!commonAlleles.contains(accession)) {
                uncommon.add(allele);
            }
        }

        return uncommon;
    }

    /**
     * Compares two allele names field by field, looking only at the fields both of them have.
     * <p>
     * Both names are split on {@link #COLON}; the comparison covers as many leading fields as the
     * shorter name provides, so {@code "A*01:01:01"} matches {@code "A*01:01"}.
     *
     * @param allele          the allele name to test
     * @param referenceAllele the allele name to compare against
     * @return {@code true} when all shared leading fields are equal; {@code false} when they
     *         differ or when either argument is {@code null}
     */
    public static boolean fieldLevelComparison(String allele, String referenceAllele) {
        if (allele == null || referenceAllele == null) {
            return false;
        }

        String[] fields = allele.split(COLON);
        String[] referenceFields = referenceAllele.split(COLON);
        int sharedFieldCount = Math.min(fields.length, referenceFields.length);

        // fields never contain the separator, so comparing them one by one is the same
        // as comparing the re-joined prefixes
        for (int index = 0; index < sharedFieldCount; index++) {
            if (!fields[index].equals(referenceFields[index])) {
                return false;
            }
        }

        return true;
    }

    /**
     * Tests whether {@code allele}, reduced with {@link #convertToProteinLevel(String)}, is a
     * member of the antigen recognition site (ARS) entry keyed by {@code referenceAllele}.
     *
     * @param allele          the allele name to test
     * @param referenceAllele the key looked up in the ARS map of {@link AntigenRecognitionSiteLoader}
     * @return {@code true} when the ARS map has an entry for {@code referenceAllele} whose value
     *         set contains the reduced allele; {@code false} otherwise, including when the ARS
     *         data cannot be loaded
     */
    public static boolean checkAntigenRecognitionSite(String allele, String referenceAllele) {
        String matchedValue = convertToProteinLevel(allele);

        AntigenRecognitionSiteLoader instance;

        try {
            instance = AntigenRecognitionSiteLoader.getInstance();
        } catch (IOException | InvalidFormatException e) {
            LOGGER.warning("Could not load ars data.");
            e.printStackTrace();
            // without ARS data nothing can match; carrying on would dereference a null loader
            return false;
        }

        HashMap<String, HashSet<String>> arsMap = instance.getArsMap();

        // direct lookup replaces the former containsKey + key-set scan for the same key
        HashSet<String> members = arsMap.get(referenceAllele);

        return members != null && members.contains(matchedValue);
    }

    /**
     * Reduces an allele name to its first two colon-separated fields.
     * <p>
     * When the name has more than {@link #P_GROUP_LEVEL} fields and its last character matches
     * {@link #VARIANTS_REGEX} (S, N, L or Q), that character is appended to the two-field name.
     * A name with fewer than {@link #P_GROUP_LEVEL} fields cannot be reduced: {@code null} is
     * returned, with a warning logged unless the name equals {@code GLStringConstants.NNNN}.
     *
     * @param allele the allele name to reduce
     * @return the two-field name, optionally followed by the trailing variant letter, or
     *         {@code null} when the name has fewer than two fields
     */
    public static String convertToProteinLevel(String allele) {
        final String[] fields = allele.split(COLON);

        if (fields.length < P_GROUP_LEVEL) {
            if (!allele.equals(GLStringConstants.NNNN)) {
                LOGGER.warning("Unexpected allele: " + allele);
            }
            return null;
        }

        final String twoFieldName = fields[0] + COLON + fields[1];

        if (fields.length > P_GROUP_LEVEL) {
            char lastChar = allele.charAt(allele.length() - 1);
            if (Pattern.matches(VARIANTS_REGEX, String.valueOf(lastChar))) {
                LOGGER.finest("Found an SNLQ while comparing ARS: " + allele);
                return twoFieldName + lastChar;
            }
        }

        return twoFieldName;
    }

    // TODO:  Fix homozygous checker - not dealing with genotypic ambiguity appropriately (S2 - DRB4 example)
    /**
     * Decides whether a set of per-haplotype allele lists is to be treated as homozygous.
     * <p>
     * Zero or one list counts as homozygous. With several lists the result is {@code true} as soon
     * as one list contains every allele of another list (compared by position, so two equal lists
     * at different positions match).
     *
     * @param alleles one allele list per haplotype; may be {@code null}
     * @return {@code false} for {@code null} input or when no list contains another;
     *         {@code true} otherwise
     */
    public static boolean checkHomozygous(List<List<String>> alleles) {
        if (alleles == null) {
            return false;
        }

        final int haplotypeCount = alleles.size();
        if (haplotypeCount <= 1) {
            return true;
        }

        for (int outer = 0; outer < haplotypeCount; outer++) {
            List<String> candidate = alleles.get(outer);

            for (int inner = 0; inner < haplotypeCount; inner++) {
                if (inner == outer) {
                    // never compare a list with itself
                    continue;
                }
                if (candidate.containsAll(alleles.get(inner))) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Expands a shorthand GL String into a fully qualified one.
     * <p>
     * The input is tokenized on the GL String delimiters, which are copied to the output
     * unchanged. For every other token:
     * <ul>
     * <li>a token starting with an upper-case letter is prefixed with
     * {@code GLStringConstants.HLA_DASH} when missing, and the text before its first asterisk is
     * remembered as the current locus;</li>
     * <li>any other token is completed with the current locus via
     * {@link #fillLocus(Locus, String)};</li>
     * <li>a token whose second colon-separated field starts with an upper-case letter is passed to
     * {@link #decodeMAC(String)} and replaced by the result.</li>
     * </ul>
     * When decoding fails ({@code decodeMAC} returns {@code null}) the undecoded token is kept, so
     * the output never contains the literal text {@code "null"}.
     *
     * @param shorthand the GL String to expand
     * @return the expanded GL String
     */
    public static String fullyQualifyGLString(String shorthand) {
        // returnDelims = true: delimiters come back as tokens so they can be copied through
        StringTokenizer st = new StringTokenizer(shorthand, GL_STRING_DELIMITER_REGEX, true);
        StringBuilder sb = new StringBuilder();
        String locus = null;

        while (st.hasMoreTokens()) {
            String part = st.nextToken();
            String firstChar = part.substring(0, 1);

            if (firstChar.matches(ALPHA_REGEX)) {
                if (!part.startsWith(GLStringConstants.HLA_DASH)) {
                    part = GLStringConstants.HLA_DASH + part;
                }

                // remember the locus for following alleles written without one
                locus = part.split(ESCAPED_ASTERISK)[0];
            } else if (firstChar.matches(GL_STRING_DELIMITER_REGEX)) {
                sb.append(part);
                continue;
            } else {
                part = fillLocus(Locus.lookup(locus), part);
            }

            String[] segments = part.split(COLON);

            // an empty second field ("a::b") is left alone instead of failing in substring()
            if (segments.length > 1 && !segments[1].isEmpty()
                    && segments[1].substring(0, 1).matches(ALPHA_REGEX)) {
                String decoded = decodeMAC(part);
                if (decoded != null) {
                    part = decoded;
                }
            }

            sb.append(part);
        }

        return sb.toString();
    }

    /**
     * Prefixes {@code segment} with {@code locus} and {@code GLStringConstants.ASTERISK} unless
     * the segment already starts with an upper-case letter.
     *
     * @param locus   the locus to put in front of an unqualified segment
     * @param segment the allele segment; must not be empty
     * @return the segment, qualified with the locus when it was not already
     */
    public static String fillLocus(Locus locus, String segment) {
        boolean startsWithLetter = segment.substring(0, 1).matches(ALPHA_REGEX);

        return startsWithLetter ? segment : locus + GLStringConstants.ASTERISK + segment;
    }

    /**
     * Parses GL Strings from an already opened reader. The reader is not closed here.
     * <p>
     * I/O and XML parsing failures are logged at severe level, reported with
     * {@code printStackTrace()} and swallowed.
     *
     * @param name   name of the source; its extension selects the format and it is used in
     *               log messages
     * @param reader the reader supplying the content
     * @return the parsed genotype lists, or {@code null} when reading or parsing failed
     */
    public static List<LinkageDisequilibriumGenotypeList> readGLStringFile(String name, BufferedReader reader) {
        try {
            return parseGLStringFile(name, reader);
        } catch (IOException e) {
            LOGGER.severe("Problem reading GL String file: " + name);
            e.printStackTrace();
        } catch (ParserConfigurationException | SAXException e) {
            LOGGER.severe("Couldn't parse xml file: " + name);
            e.printStackTrace();
        }

        return null;
    }

    /**
     * Opens {@code filename} and parses the GL Strings it contains.
     * <p>
     * The name is first resolved as a class-path resource through this class's class loader;
     * when no such resource exists it is opened as a file-system path. Failures to find, read or
     * parse the file are logged at severe level, reported with {@code printStackTrace()} and
     * swallowed. The reader is always closed when one was opened.
     *
     * @param filename class-path resource name or file-system path; its extension selects the format
     * @return the parsed genotype lists, or {@code null} when the file could not be found, read
     *         or parsed
     */
    public static List<LinkageDisequilibriumGenotypeList> readGLStringFile(String filename) {
        BufferedReader reader = null;
        List<LinkageDisequilibriumGenotypeList> linkedGLStrings = null;

        try {
            InputStream stream = GLStringUtilities.class.getClassLoader().getResourceAsStream(filename);
            if (stream == null) {
                stream = new FileInputStream(filename);
            }

            reader = new BufferedReader(new InputStreamReader(stream));

            linkedGLStrings = parseGLStringFile(filename, reader);

        } catch (FileNotFoundException e) {
            LOGGER.severe("Couldn't find GL String file: " + filename);
            e.printStackTrace();
        } catch (IOException e) {
            LOGGER.severe("Problem opening GL String file: " + filename);
            e.printStackTrace();
        } catch (SAXException | ParserConfigurationException e) {
            LOGGER.severe("Couldn't parse xml file: " + filename);
            e.printStackTrace();
        } finally {
            // reader is still null when the file was not found; closing it unconditionally
            // would replace the handled failure with a NullPointerException
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    LOGGER.severe("Problem closing reader/stream.");
                    e.printStackTrace();
                }
            }
        }

        return linkedGLStrings;
    }

    /**
     * Reads genotype lists from {@code reader}; the format is chosen from the extension of
     * {@code filename}.
     * <p>
     * <b>XML/HML</b> (name ends with {@code GLStringConstants.XML} or {@code GLStringConstants.HML}):
     * every sample element yields one genotype list. Its id is the sample's id attribute and its
     * GL String is the trimmed GL String text of the first allele assignment of each typing
     * element, joined with {@code GLStringConstants.GENE_DELIMITER}.
     * <p>
     * <b>Delimited text</b> (anything else): every line is split on tab or comma.
     * <ul>
     * <li>one field: the field is the GL String and the id is
     * {@code filename + "-" + zeroBasedLineIndex};</li>
     * <li>two or more fields: id, GL String, and a note when there are exactly three fields;</li>
     * <li>no field (a line made only of delimiters): skipped with a warning.</li>
     * </ul>
     * The note is reset for every line, so a line without a note never inherits the note of an
     * earlier line.
     *
     * @param filename name of the source; selects the format and is used for generated ids
     * @param reader   the reader supplying the content; not closed by this method
     * @return the genotype lists in file order; never {@code null}
     * @throws IOException                  when reading fails
     * @throws ParserConfigurationException when no XML parser can be created
     * @throws SAXException                 when the XML content cannot be parsed
     */
    private static List<LinkageDisequilibriumGenotypeList> parseGLStringFile(String filename, BufferedReader reader)
            throws IOException, ParserConfigurationException, SAXException {
        List<LinkageDisequilibriumGenotypeList> linkedGLStrings = new ArrayList<LinkageDisequilibriumGenotypeList>();

        if (filename.endsWith(GLStringConstants.XML) || filename.endsWith(GLStringConstants.HML)) {
            // NOTE(review): the parser is used with its default configuration, so DTDs and
            // external entities are not disabled - confirm that input files are trusted.
            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            DocumentBuilder builder = factory.newDocumentBuilder();
            InputSource is = new InputSource(reader);
            Document doc = builder.parse(is);

            NodeList nList = doc.getElementsByTagName(GLStringConstants.SAMPLE_ELEMENT);
            for (int i = 0; i < nList.getLength(); i++) {
                String sampleId = nList.item(i).getAttributes().getNamedItem(GLStringConstants.ID_ATTRIBUTE)
                        .getNodeValue();
                StringBuilder glString = new StringBuilder();
                NodeList typingElements = ((Element) nList.item(i))
                        .getElementsByTagName(GLStringConstants.TYPING_ELEMENT);
                for (int j = 0; j < typingElements.getLength(); j++) {
                    // only the first allele assignment of a typing is used
                    Element alleleAssignment = (Element) ((Element) typingElements.item(j))
                            .getElementsByTagName(GLStringConstants.ALLELE_ASSIGNMENT_ELEMENT).item(0);
                    if (j > 0) {
                        glString.append(GLStringConstants.GENE_DELIMITER);
                    }
                    glString.append(
                            ((Element) alleleAssignment.getElementsByTagName(GLStringConstants.GL_STRING_ELEMENT)
                                    .item(0)).getTextContent().trim());
                }

                linkedGLStrings.add(inflateGenotypeList(sampleId, glString.toString(), null));
            }
        } else {
            String line;
            int lineNumber = 0;

            while ((line = reader.readLine()) != null) {
                lineNumber++;

                String[] parts = line.split(FILE_DELIMITER_REGEX);
                String id;
                String glString;
                // declared per line: a note must not carry over from a previous line
                String note = null;

                if (parts.length == 1) {
                    id = filename + "-" + (lineNumber - 1);
                    glString = parts[0];
                } else if (parts.length >= 2) {
                    id = parts[0];
                    glString = parts[1];

                    if (parts.length == 3) {
                        note = parts[2];
                    }
                } else {
                    // split() returns an empty array for a line consisting only of delimiters
                    LOGGER.warning("Unexpected line format at line " + (lineNumber - 1) + ": " + filename);

                    continue;
                }

                linkedGLStrings.add(inflateGenotypeList(id, glString, note));
            }
        }

        return linkedGLStrings;
    }

    /**
     * Creates a {@link LinkageDisequilibriumGenotypeList} from a raw GL String.
     * <p>
     * A GL String that fails {@link #validateGLStringFormat(String)} is first expanded with
     * {@link #fullyQualifyGLString(String)}. The result is converted with
     * {@link #convertToMug(String)}; the GL String as originally submitted and the note are
     * stored on the returned object.
     *
     * @param id       identifier of the genotype list
     * @param glString the GL String as submitted
     * @param note     optional note; may be {@code null}
     * @return the populated genotype list
     */
    private static LinkageDisequilibriumGenotypeList inflateGenotypeList(String id, String glString, String note) {
        String qualifiedGlString = GLStringUtilities.validateGLStringFormat(glString)
                ? glString
                : GLStringUtilities.fullyQualifyGLString(glString);

        LinkageDisequilibriumGenotypeList genotypeList = new LinkageDisequilibriumGenotypeList(id,
                GLStringUtilities.convertToMug(qualifiedGlString));

        genotypeList.setSubmittedGlString(glString);
        genotypeList.setNote(note);

        return genotypeList;
    }

    /**
     * Converts a GL String into a {@link MultilocusUnphasedGenotype} using a client obtained from
     * {@code LocalGlClient.create()}.
     * <p>
     * A {@link GlClientException} is logged at severe level, reported with
     * {@code printStackTrace()} and swallowed.
     *
     * @param glString the GL String to convert
     * @return the genotype, or {@code null} when the client rejects the GL String
     */
    public static MultilocusUnphasedGenotype convertToMug(String glString) {
        try {
            // TODO: should use strict but example GL Strings are missing intron
            // variants in some cases (HLA-DQB1*02:02)
            // GlClient glClient = LocalGlClient.createStrict();
            GlClient client = LocalGlClient.create();

            return client.createMultilocusUnphasedGenotype(glString);
        } catch (GlClientException e) {
            LOGGER.severe("Couldn't convert GLString to MultiLocusUnphasedGenotype");
            e.printStackTrace();
        }

        return null;
    }
}