org.dash.valid.freq.HLAFrequenciesLoader.java Source code

Java tutorial

Introduction

Here is the source code for org.dash.valid.freq.HLAFrequenciesLoader.java

Source

/*
    
Copyright (c) 2014-2015 National Marrow Donor Program (NMDP)
    
This library is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published
by the Free Software Foundation; either version 3 of the License, or (at
your option) any later version.
    
This library is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.
    
You should have received a copy of the GNU Lesser General Public License
along with this library;  if not, write to the Free Software Foundation,
Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA.
    
> http://www.gnu.org/licenses/lgpl.html
    
*/
package org.dash.valid.freq;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
import org.dash.valid.DisequilibriumElement;
import org.dash.valid.Linkages;
import org.dash.valid.LinkagesLoader;
import org.dash.valid.Locus;
import org.dash.valid.base.BaseDisequilibriumElement;
import org.dash.valid.gl.GLStringConstants;
import org.dash.valid.gl.GLStringUtilities;
import org.dash.valid.race.DisequilibriumElementByRace;
import org.dash.valid.race.FrequencyByRace;
import org.dash.valid.race.FrequencyByRaceComparator;

/**
 * Lazily created singleton that loads haplotype (linkage disequilibrium)
 * frequency reference data and per-locus allele lists from classpath
 * resources.
 *
 * <p>
 * The frequency set is chosen through the system property named by
 * {@code Frequencies.FREQUENCIES_PROPERTY}; the linkages to load come from
 * {@code LinkagesLoader}. Spreadsheet resources are read with Apache POI, the
 * remaining resources are comma- or tab-delimited text.
 *
 * <p>
 * No synchronization is performed by this class.
 */
public class HLAFrequenciesLoader {
    /** Loaded haplotype frequencies, keyed by the loci of the linkage they describe. */
    private HashMap<EnumSet<Locus>, List<DisequilibriumElement>> disequilibriumElementsMap = new HashMap<>();
    /** Alleles (with {@code HLA-} prefix) listed in the per-locus workbooks, keyed by locus. */
    private final HashMap<Locus, List<String>> individualLocusFrequencies = new HashMap<>();

    private static final String UNDERSCORE = "_";

    private static final String WIKIVERSITY_BC_FREQUENCIES = "frequencies/wikiversity/BCLinkageDisequilibrium.txt";
    private static final String WIKIVERSITY_DRDQ_FREQUENCIES = "frequencies/wikiversity/DRDQLinkageDisequilibrium.txt";

    private static final String NMDP_ABC_FREQUENCIES = "frequencies/nmdp/A~C~B.xlsx";
    private static final String NMDP_BC_FREQUENCIES = "frequencies/nmdp/C~B.xlsx";
    private static final String NMDP_DRDQ_FREQUENCIES = "frequencies/nmdp/DRB3-4-5~DRB1~DQB1.xlsx";
    private static final String NMDP_FIVE_LOCUS_FREQUENCIES = "frequencies/nmdp/A~C~B~DRB1~DQB1.xlsx";

    private static final String NMDP_2007_ABC_FREQUENCIES = "frequencies/nmdp-2007/ACB.xls";
    private static final String NMDP_2007_BC_FREQUENCIES = "frequencies/nmdp-2007/CB.xls";
    private static final String NMDP_2007_DRB1DQB1_FREQUENCIES = "frequencies/nmdp-2007/DRB1DQB1.xls";
    private static final String NMDP_2007_FIVE_LOCUS_FREQUENCIES = "frequencies/nmdp-2007/ACBDRB1DQB1.xls";

    private static final String NMDP_STD_FIVE_LOCUS_FREQUENCIES = "frequencies/nmdp-std/2015_ALL.csv";

    // Each array gives, by leading column index, the locus stored in that
    // column of the corresponding resource.
    private static final Locus[] BASE_BC_LOCI_POS = new Locus[] { Locus.HLA_B, Locus.HLA_C };
    private static final Locus[] BASE_DRDQ_LOCI_POS = new Locus[] { Locus.HLA_DRB1, Locus.HLA_DRB345,
            Locus.HLA_DQA1, Locus.HLA_DQB1 };
    private static final Locus[] NMDP_ABC_LOCI_POS = new Locus[] { Locus.HLA_A, Locus.HLA_C, Locus.HLA_B };
    private static final Locus[] NMDP_BC_LOCI_POS = new Locus[] { Locus.HLA_C, Locus.HLA_B };
    private static final Locus[] NMDP_DRB1DQB1_LOCI_POS = new Locus[] { Locus.HLA_DRB1, Locus.HLA_DQB1 };
    private static final Locus[] NMDP_DRDQB1_LOCI_POS = new Locus[] { Locus.HLA_DRB345, Locus.HLA_DRB1,
            Locus.HLA_DQB1 };
    private static final Locus[] NMDP_FIVE_LOCUS_POS = new Locus[] { Locus.HLA_A, Locus.HLA_C, Locus.HLA_B,
            Locus.HLA_DRB1, Locus.HLA_DQB1 };

    private static HLAFrequenciesLoader instance = null;

    private static final Logger LOGGER = Logger.getLogger(HLAFrequenciesLoader.class.getName());

    private HLAFrequenciesLoader() {

    }

    /**
     * Returns the shared loader, creating it and loading the configured
     * frequency set on first use.
     *
     * <p>
     * If loading fails the shared instance is discarded again, so a later call
     * retries the load instead of handing out an empty loader.
     *
     * @return the shared, initialized loader
     * @throws IOException
     *             if a reference resource cannot be found or read
     */
    public static HLAFrequenciesLoader getInstance() throws IOException {
        if (instance == null) {
            HLAFrequenciesLoader loader = new HLAFrequenciesLoader();
            Frequencies freq = Frequencies.lookup(System.getProperty(Frequencies.FREQUENCIES_PROPERTY));

            // Published before init() so that any call to getInstance() made
            // while loading still sees this object, exactly as before.
            instance = loader;
            try {
                loader.init(freq);
            } catch (IOException | RuntimeException e) {
                // Never keep a partially loaded singleton around.
                instance = null;
                throw e;
            }
        }

        return instance;
    }

    /**
     * Tells whether a per-locus allele list has been loaded for the locus.
     *
     * @param locus
     *            the locus to look up
     * @return {@code true} if an allele list is present for {@code locus}
     */
    public boolean hasIndividualFrequency(Locus locus) {
        return individualLocusFrequencies.get(locus) != null;
    }

    /**
     * Looks for a loaded allele of the given locus that matches
     * {@code allele}.
     *
     * @param locus
     *            the locus whose allele list is searched
     * @param allele
     *            the allele to match
     * @return the first loaded allele for which either
     *         {@code GLStringUtilities.fieldLevelComparison} or
     *         {@code GLStringUtilities.checkAntigenRecognitionSite} returns a
     *         non-null result, or {@code null} if there is none or no list is
     *         loaded for the locus
     */
    public String hasFrequency(Locus locus, String allele) {
        List<String> allelesWithFrequency = individualLocusFrequencies.get(locus);
        if (allelesWithFrequency == null) {
            return null;
        }

        for (String alleleWithFrequency : allelesWithFrequency) {
            if (GLStringUtilities.fieldLevelComparison(allele, alleleWithFrequency) != null
                    || GLStringUtilities.checkAntigenRecognitionSite(allele, alleleWithFrequency) != null) {
                return alleleWithFrequency;
            }
        }

        return null;
    }

    /**
     * Discards everything loaded so far and loads the frequency set currently
     * selected by the {@code Frequencies.FREQUENCIES_PROPERTY} system property.
     *
     * @throws IOException
     *             if a reference resource cannot be found or read
     */
    public void reloadFrequencies() throws IOException {
        Frequencies freq = Frequencies.lookup(System.getProperty(Frequencies.FREQUENCIES_PROPERTY));

        this.disequilibriumElementsMap = new HashMap<>();
        // Per-locus lists belong to the previous frequency set as well; without
        // this they would survive a switch to a different set.
        this.individualLocusFrequencies.clear();

        init(freq);
    }

    /**
     * Loads the reference data of {@code freq} for every configured linkage.
     * Only the resource belonging to each configured linkage is read.
     *
     * @param freq
     *            the frequency set to load
     * @throws IOException
     *             if a resource is missing or unreadable; an
     *             {@code InvalidFormatException} is wrapped in the thrown
     *             exception
     */
    private void init(Frequencies freq) throws IOException {
        try {
            switch (freq) {
            case NMDP_2007:
                for (Linkages linkage : LinkagesLoader.getInstance().getLinkages()) {
                    switch (linkage) {
                    case A_B_C:
                        this.disequilibriumElementsMap.put(Locus.A_B_C_LOCI,
                                loadNMDPLinkageReferenceData(NMDP_2007_ABC_FREQUENCIES, NMDP_ABC_LOCI_POS));
                        break;
                    case B_C:
                        this.disequilibriumElementsMap.put(Locus.B_C_LOCI,
                                loadNMDPLinkageReferenceData(NMDP_2007_BC_FREQUENCIES, NMDP_BC_LOCI_POS));
                        break;
                    case DRB1_DQB1:
                        this.disequilibriumElementsMap.put(Locus.DRB1_DQB1_LOCI, loadNMDPLinkageReferenceData(
                                NMDP_2007_DRB1DQB1_FREQUENCIES, NMDP_DRB1DQB1_LOCI_POS));
                        break;
                    case FIVE_LOCUS:
                        this.disequilibriumElementsMap.put(Locus.FIVE_LOCUS, loadNMDPLinkageReferenceData(
                                NMDP_2007_FIVE_LOCUS_FREQUENCIES, NMDP_FIVE_LOCUS_POS));
                        break;
                    default:
                        break;
                    }
                }
                loadIndividualLocusFrequencies(freq);
                break;
            case NMDP:
                for (Linkages linkage : LinkagesLoader.getInstance().getLinkages()) {
                    switch (linkage) {
                    case A_B_C:
                        this.disequilibriumElementsMap.put(Locus.A_B_C_LOCI,
                                loadNMDPLinkageReferenceData(NMDP_ABC_FREQUENCIES, NMDP_ABC_LOCI_POS));
                        break;
                    case B_C:
                        this.disequilibriumElementsMap.put(Locus.B_C_LOCI,
                                loadNMDPLinkageReferenceData(NMDP_BC_FREQUENCIES, NMDP_BC_LOCI_POS));
                        break;
                    case DRB_DQB:
                        this.disequilibriumElementsMap.put(Locus.DRB_DQB_LOCI,
                                loadNMDPLinkageReferenceData(NMDP_DRDQ_FREQUENCIES, NMDP_DRDQB1_LOCI_POS));
                        break;
                    case FIVE_LOCUS:
                        this.disequilibriumElementsMap.put(Locus.FIVE_LOCUS,
                                loadNMDPLinkageReferenceData(NMDP_FIVE_LOCUS_FREQUENCIES, NMDP_FIVE_LOCUS_POS));
                        break;
                    default:
                        break;
                    }
                }
                loadIndividualLocusFrequencies(freq);
                break;
            case NMDP_STD:
                for (Linkages linkage : LinkagesLoader.getInstance().getLinkages()) {
                    switch (linkage) {
                    case FIVE_LOCUS:
                        this.disequilibriumElementsMap.put(Locus.FIVE_LOCUS,
                                loadStandardReferenceData(NMDP_STD_FIVE_LOCUS_FREQUENCIES));
                        break;
                    default:
                        break;
                    }
                }
                loadIndividualLocusFrequencies(freq);
                break;
            case WIKIVERSITY:
                for (Linkages linkage : LinkagesLoader.getInstance().getLinkages()) {
                    switch (linkage) {
                    case B_C:
                        this.disequilibriumElementsMap.put(Locus.B_C_LOCI,
                                loadLinkageReferenceData(WIKIVERSITY_BC_FREQUENCIES, BASE_BC_LOCI_POS));
                        break;
                    case DRB_DQ:
                        this.disequilibriumElementsMap.put(Locus.DRB_DQ_LOCI,
                                loadLinkageReferenceData(WIKIVERSITY_DRDQ_FREQUENCIES, BASE_DRDQ_LOCI_POS));
                        break;
                    default:
                        break;
                    }
                }
                break;
            }
        } catch (IOException | InvalidFormatException ioe) {
            if (Frequencies.NMDP.equals(freq)) {
                LOGGER.warning(
                        "2011 NMDP Frequencies are not included by default.  Please be sure you've loaded them according to the instructions in the README.");
            }
            LOGGER.log(Level.SEVERE, "Couldn't load disequilibrium element reference file.", ioe);

            throw new IOException(ioe);
        }
    }

    /**
     * Returns the haplotype frequencies loaded for the given loci.
     *
     * @param loci
     *            the loci identifying a linkage
     * @return the loaded elements, or a new empty list if nothing was loaded
     *         for {@code loci}
     */
    public List<DisequilibriumElement> getDisequilibriumElements(EnumSet<Locus> loci) {
        List<DisequilibriumElement> elements = this.disequilibriumElementsMap.get(loci);

        return elements != null ? elements : new ArrayList<DisequilibriumElement>();
    }

    /**
     * @return the internal (live, mutable) map of per-locus allele lists
     */
    public HashMap<Locus, List<String>> getIndividualLocusFrequencies() {
        return individualLocusFrequencies;
    }

    /**
     * Opens a classpath resource.
     *
     * @param filename
     *            resource path relative to the classpath root
     * @return an open stream; the caller must close it
     * @throws FileNotFoundException
     *             if the resource does not exist
     */
    private static InputStream openResource(String filename) throws FileNotFoundException {
        InputStream stream = HLAFrequenciesLoader.class.getClassLoader().getResourceAsStream(filename);

        if (stream == null) {
            throw new FileNotFoundException("Frequency resource not found on classpath: " + filename);
        }

        return stream;
    }

    /**
     * Builds the allele name stored for a spreadsheet cell: a value without an
     * asterisk is expanded to {@code <locus short name>*<first two
     * chars>:<rest>}; the result is always prefixed with
     * {@code GLStringConstants.HLA_DASH}.
     */
    private static String toHlaAlleleName(Locus locus, String cellValue) {
        String allele = cellValue;

        if (!allele.contains(GLStringConstants.ASTERISK)) {
            allele = locus.getShortName() + GLStringConstants.ASTERISK + allele.substring(0, 2)
                    + GLStringUtilities.COLON + allele.substring(2);
        }

        return GLStringConstants.HLA_DASH + allele;
    }

    /**
     * Reads a comma-delimited resource whose rows are
     * {@code race,haplotype,frequency} and groups the rows by haplotype. Blank
     * lines are skipped.
     *
     * @param filename
     *            classpath location of the resource
     * @return one element per distinct haplotype, carrying all of its per-race
     *         frequencies (ranks are {@code null})
     * @throws IOException
     *             if the resource is missing or cannot be read
     */
    private List<DisequilibriumElement> loadStandardReferenceData(String filename)
            throws IOException, InvalidFormatException {
        HashMap<String, List<FrequencyByRace>> frequencyMap = new HashMap<>();

        try (BufferedReader reader = new BufferedReader(new InputStreamReader(openResource(filename)))) {
            String row;

            while ((row = reader.readLine()) != null) {
                if (row.trim().isEmpty()) {
                    continue;
                }

                String[] columns = row.split(GLStringConstants.COMMA);

                String race = columns[0];
                String haplotype = columns[1];
                Double frequency = Double.valueOf(columns[2]);

                List<FrequencyByRace> freqList = frequencyMap.get(haplotype);

                if (freqList == null) {
                    freqList = new ArrayList<>();
                    frequencyMap.put(haplotype, freqList);
                }

                freqList.add(new FrequencyByRace(frequency, null, race));
            }
        }

        List<DisequilibriumElement> disequilibriumElements = new ArrayList<>();

        for (String haplotype : frequencyMap.keySet()) {
            HashMap<Locus, String> hlaElementMap = new HashMap<>();

            // Each phase-delimited part is one allele; the text before its
            // asterisk identifies the locus.
            for (String locusHaplotype : haplotype.split(GLStringConstants.GENE_PHASE_DELIMITER)) {
                String[] parts = locusHaplotype.split(GLStringUtilities.ESCAPED_ASTERISK);
                hlaElementMap.put(Locus.lookup(parts[0]), locusHaplotype);
            }

            disequilibriumElements.add(new DisequilibriumElementByRace(hlaElementMap, frequencyMap.get(haplotype)));
        }

        return disequilibriumElements;
    }

    /**
     * Reads the first sheet of a spreadsheet resource. The first row supplies
     * the race headers; every other row becomes one element.
     *
     * @param filename
     *            classpath location of the workbook
     * @param locusPositions
     *            the locus stored in each leading column
     * @return the elements read, in row order
     * @throws IOException
     *             if the resource is missing or cannot be read
     */
    private List<DisequilibriumElement> loadNMDPLinkageReferenceData(String filename, Locus[] locusPositions)
            throws IOException, InvalidFormatException {
        List<DisequilibriumElement> disequilibriumElements = new ArrayList<>();

        try (InputStream inStream = openResource(filename); Workbook workbook = WorkbookFactory.create(inStream)) {
            Sheet sheet = workbook.getSheetAt(0);
            int firstRow = sheet.getFirstRowNum();
            List<String> raceHeaders = null;

            for (Row row : sheet) {
                if (row.getRowNum() == firstRow) {
                    raceHeaders = readHeaderElementsByRace(row);
                } else {
                    disequilibriumElements.add(readDiseqilibriumElementsByRace(row, raceHeaders, locusPositions));
                }
            }
        }

        return disequilibriumElements;
    }

    /**
     * Loads the per-locus allele list for every locus of every configured
     * linkage that reports {@code hasIndividualFrequencies()}. A locus shared
     * by several linkages is read only once.
     */
    private void loadIndividualLocusFrequencies(Frequencies freq) throws IOException, InvalidFormatException {
        for (Linkages linkage : LinkagesLoader.getInstance().getLinkages()) {
            for (Locus locus : linkage.getLoci()) {
                if (locus.hasIndividualFrequencies() && !individualLocusFrequencies.containsKey(locus)) {
                    loadIndividualLocusFrequency(freq, locus);
                }
            }
        }
    }

    /**
     * Reads the alleles in the first column of the per-locus workbook
     * {@code frequencies/<set short name>/<locus frequency name>.xlsx|.xls}
     * ({@code .xlsx} only for {@code Frequencies.NMDP}) and stores them for
     * {@code locus}. The first row is skipped, as are rows without a first
     * cell. Does nothing if the workbook is not on the classpath.
     */
    private void loadIndividualLocusFrequency(Frequencies freq, Locus locus)
            throws IOException, InvalidFormatException {
        String extension = freq.equals(Frequencies.NMDP) ? ".xlsx" : ".xls";
        String resource = "frequencies/" + freq.getShortName() + "/" + locus.getFrequencyName() + extension;

        try (InputStream inputStream = HLAFrequenciesLoader.class.getClassLoader().getResourceAsStream(resource)) {
            // The per-locus workbook is optional.
            if (inputStream == null) {
                return;
            }

            List<String> singleLocusFrequencies = new ArrayList<>();

            try (Workbook workbook = WorkbookFactory.create(inputStream)) {
                Sheet sheet = workbook.getSheetAt(0);
                int firstRow = sheet.getFirstRowNum();

                for (Row row : sheet) {
                    if (row.getRowNum() == firstRow) {
                        continue;
                    }

                    Cell alleleCell = row.getCell(0);
                    if (alleleCell == null) {
                        continue;
                    }

                    singleLocusFrequencies.add(toHlaAlleleName(locus, alleleCell.getStringCellValue()));
                }
            }

            individualLocusFrequencies.put(locus, singleLocusFrequencies);
        }
    }

    /**
     * Reads the header row into a list indexed by column: each entry is the
     * cell text up to its first underscore. Columns whose header cell is
     * missing are left as {@code null}.
     */
    private List<String> readHeaderElementsByRace(Row row) {
        List<String> raceHeaders = new ArrayList<>();

        for (Cell cell : row) {
            int columnIndex = cell.getColumnIndex();
            String[] race = cell.getStringCellValue().split(UNDERSCORE);

            // The cell iterator skips undefined cells, so pad up to the column
            // index to keep header positions aligned with data columns.
            while (raceHeaders.size() <= columnIndex) {
                raceHeaders.add(null);
            }
            raceHeaders.set(columnIndex, race[0]);
        }

        return raceHeaders;
    }

    /**
     * Converts one data row into an element. The first
     * {@code locusPositions.length} columns hold alleles; of the remaining
     * columns, those with the same parity as {@code locusPositions.length}
     * are read as frequencies (the column after each is read as its rank).
     *
     * @param row
     *            the spreadsheet row to convert
     * @param raceHeaders
     *            header text by column index
     * @param locusPositions
     *            the locus stored in each leading column
     * @return the populated element
     */
    private DisequilibriumElement readDiseqilibriumElementsByRace(Row row, List<String> raceHeaders,
            Locus[] locusPositions) {
        List<FrequencyByRace> frequenciesByRace = new ArrayList<>();
        DisequilibriumElementByRace disElement = new DisequilibriumElementByRace();

        for (Cell cell : row) {
            int columnIndex = cell.getColumnIndex();

            if (columnIndex < locusPositions.length) {
                Locus locus = locusPositions[columnIndex];
                disElement.setHlaElement(locus, toHlaAlleleName(locus, cell.getStringCellValue()));
            } else if (columnIndex % 2 == locusPositions.length % 2) {
                disElement.setFrequenciesByRace(loadFrequencyAndRank(row, cell, frequenciesByRace, raceHeaders));
            }
        }

        return disElement;
    }

    /**
     * Appends the frequency held in {@code cell} to {@code frequenciesByRace}
     * unless it is zero, then re-sorts the list with
     * {@code FrequencyByRaceComparator}.
     *
     * @param row
     *            the row containing {@code cell}
     * @param cell
     *            the frequency cell; the cell to its right supplies the rank
     * @param frequenciesByRace
     *            the list to extend; modified in place
     * @param raceHeaders
     *            header text by column index
     * @return {@code frequenciesByRace}
     */
    private List<FrequencyByRace> loadFrequencyAndRank(Row row, Cell cell, List<FrequencyByRace> frequenciesByRace,
            List<String> raceHeaders) {
        Double freq = cell.getNumericCellValue();

        if (freq.doubleValue() != 0) {
            Cell rankCell = row.getCell(cell.getColumnIndex() + 1);
            // A missing rank cell is recorded as "no rank" rather than failing.
            String rank = rankCell == null ? null : Double.toString(rankCell.getNumericCellValue());

            frequenciesByRace.add(new FrequencyByRace(freq, rank, raceHeaders.get(cell.getColumnIndex())));
        }

        Collections.sort(frequenciesByRace, new FrequencyByRaceComparator());

        return frequenciesByRace;
    }

    /**
     * Reads a tab-delimited resource. Each row holds one value per entry of
     * {@code locusPositions}, followed by two further columns that are passed
     * unchanged to {@code BaseDisequilibriumElement}. Blank lines are skipped.
     *
     * @param filename
     *            classpath location of the resource
     * @param locusPositions
     *            the locus stored in each leading column
     * @return the elements read, in row order
     * @throws FileNotFoundException
     *             if the resource does not exist
     * @throws IOException
     *             if the resource cannot be read
     */
    private List<DisequilibriumElement> loadLinkageReferenceData(String filename, Locus[] locusPositions)
            throws FileNotFoundException, IOException {
        List<DisequilibriumElement> disequilibriumElements = new ArrayList<>();

        try (BufferedReader reader = new BufferedReader(new InputStreamReader(openResource(filename)))) {
            String row;

            while ((row = reader.readLine()) != null) {
                if (row.trim().isEmpty()) {
                    continue;
                }

                String[] columns = row.split(GLStringConstants.TAB);
                HashMap<Locus, String> hlaElementMap = new HashMap<>();

                for (int i = 0; i < locusPositions.length; i++) {
                    hlaElementMap.put(locusPositions[i], columns[i]);
                }

                disequilibriumElements.add(new BaseDisequilibriumElement(hlaElementMap,
                        columns[locusPositions.length], columns[locusPositions.length + 1]));
            }
        }

        return disequilibriumElements;
    }
}