chibi.gemmaanalysis.LinkMatrix.java Source code

Java tutorial

Introduction

Here is the source code for chibi.gemmaanalysis.LinkMatrix.java

Source

/*
 * The Gemma project
 * 
 * Copyright (c) 2007 Columbia University
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package chibi.gemmaanalysis;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.Vector;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import ubic.basecode.dataStructure.matrix.CompressedBitMatrix;
import ubic.basecode.ontology.model.OntologyTerm;
import ubic.gemma.expression.experiment.service.ExpressionExperimentService;
import ubic.gemma.genome.gene.service.GeneService;
import ubic.gemma.model.expression.experiment.ExpressionExperiment;
import ubic.gemma.model.genome.Gene;
import ubic.gemma.model.genome.Taxon;
import ubic.gemma.ontology.providers.GeneOntologyService;
import cern.colt.list.ObjectArrayList;

/**
 * WARNING probably broken. Creates an efficient matrix holding information about gene links, and can answer some basic
 * questions about genes in the matrix. The matrix also knows how to index elements using gene pairs represented by two
 * values packed into a single bit array. That representation is used in classes storing links in trees.
 * <p>
 * FIXME this class mixes data structure and data access, along with 'utility' methods in a way that's a bit unclean
 * (this used be called LinkMatrixUtil).
 * 
 * @author xwan
 * @version $Id$
 * @see FrequentLinkSetFinder
 */
public class LinkMatrix {

    protected static final Log log = LogFactory.getLog(LinkMatrix.class);

    /**
     * @param mask1
     * @param mask2
     * @return
     */
    public static long[] AND(long[] mask1, long[] mask2) {
        long res[] = new long[mask1.length];
        for (int i = 0; i < mask1.length; i++)
            res[i] = mask1[i] & mask2[i];
        return res;
    }

    /**
     * @param mask
     * @param index
     * @return
     */
    public static boolean checkBits(long[] mask, int index) {
        int num = index / CompressedBitMatrix.BITS_PER_ELEMENT;
        int bit_index = index % CompressedBitMatrix.BITS_PER_ELEMENT;
        long res = mask[num] & CompressedBitMatrix.BIT1 << bit_index;
        if (res == 0)
            return false;
        return true;
    }

    public static boolean compare(long[] mask1, long mask2[]) {
        for (int i = 0; i < mask1.length; i++)
            if (mask1[i] != mask2[i])
                return false;
        return true;
    }

    /**
     * @param mask
     * @return
     */
    public static int countBits(long[] mask) {
        int bits = 0;
        for (int i = 0; i < mask.length; i++)
            bits = bits + Long.bitCount(mask[i]);
        return bits;
    }

    /**
     * @param mask1
     * @param mask2
     * @return
     */
    public static long[] OR(long[] mask1, long[] mask2) {
        long res[] = new long[mask1.length];
        for (int i = 0; i < mask1.length; i++)
            res[i] = mask1[i] | mask2[i];
        return res;
    }

    /**
     * @param mask1
     * @param mask2
     * @return
     */
    public static int overlapBits(long[] mask1, long[] mask2) {
        int bits = 0;
        for (int i = 0; i < mask1.length; i++)
            bits = bits + Long.bitCount(mask1[i] & mask2[i]);
        return bits;
    }

    /**
     * This value is adjusted when necessary.
     */
    private int shift = 50000; // to encode two geneid into one long id

    private CompressedBitMatrix<Long, Long> linkCountMatrix = null;

    private Map<Long, Integer> eeIndexMap = null;
    private Map<Integer, ExpressionExperiment> eeMap = null;

    private Map<Long, Gene> geneMap = null;

    private Collection<Gene> targetGenes = null;

    private GeneService geneService = null;

    private ExpressionExperimentService eeService = null;

    private int stringency = 2;

    private GeneOntologyService goService;

    /**
     * @param genes
     */
    public LinkMatrix(Collection<Gene> genes) {
        init(genes);
    }

    /**
     * Create from a matrix stored in a file.
     * 
     * @param matrixFile
     * @param eeMapFile
     * @param eeService
     * @param geneService
     * @throws IOException
     */
    public LinkMatrix(String matrixFile, String eeMapFile, ExpressionExperimentService eeService,
            GeneService geneService, GeneOntologyService goService) throws IOException {
        this.goService = goService;
        try (BufferedReader in = new BufferedReader(new FileReader(new File(matrixFile)));) {
            String row = null;
            int i;
            boolean hasConfig = false, hasRowNames = false, hasColNames = false;
            Collection<Long> geneIds = new HashSet<Long>();
            while ((row = in.readLine()) != null) {
                row = row.trim();
                if (StringUtils.isBlank(row))
                    continue;
                String[] subItems = row.split("\t");
                for (i = 0; i < subItems.length; i++)
                    if (StringUtils.isBlank(subItems[i]))
                        break;
                if (i != subItems.length) {
                    String mesg = "The empty Element is not allowed: " + row;
                    log.info(mesg);
                    in.close();
                    throw new IOException(mesg);
                }
                if (!hasConfig) {
                    if (subItems.length != 3) {
                        String mesg = "Data File Format Error for configuration " + row;
                        log.info(mesg);
                        throw new IOException(mesg);
                    }
                    linkCountMatrix = new CompressedBitMatrix<Long, Long>(Integer.valueOf(subItems[0]),
                            Integer.valueOf(subItems[1]), Integer.valueOf(subItems[2]));
                    hasConfig = true;
                } else if (!hasRowNames) {
                    if (subItems.length != linkCountMatrix.rows()) {
                        String mesg = "Data File Format Error for Row Names " + row;
                        log.info(mesg);
                        throw new IOException(mesg);
                    }
                    for (i = 0; i < subItems.length; i++) {
                        linkCountMatrix.addRowName(new Long(subItems[i].trim()));
                        geneIds.add(new Long(subItems[i].trim()));
                    }
                    hasRowNames = true;
                } else if (!hasColNames) {
                    if (subItems.length != linkCountMatrix.columns()) {
                        String mesg = "Data File Format Error for Col Names " + row;
                        log.info(mesg);
                        throw new IOException(mesg);
                    }
                    for (i = 0; i < subItems.length; i++) {
                        linkCountMatrix.addColumnName(new Long(subItems[i].trim()));
                        geneIds.add(new Long(subItems[i].trim()));
                    }
                    hasColNames = true;
                } else {
                    int rowIndex = Integer.valueOf(subItems[0]);
                    int colIndex = Integer.valueOf(subItems[1]);
                    double values[] = new double[subItems.length - 2];
                    for (i = 2; i < subItems.length; i++)
                        values[i - 2] = Double.longBitsToDouble(Long.parseLong(subItems[i], 16));
                    linkCountMatrix.set(rowIndex, colIndex, values);
                }
            }

            Collection<Gene> allGenes = geneService.loadMultiple(geneIds);
            for (Gene gene : allGenes) {
                geneMap.put(gene.getId(), gene);
            }
            if (eeMapFile != null) {
                try (BufferedReader in2 = new BufferedReader(new FileReader(new File(eeMapFile)));) {
                    this.eeIndexMap = new HashMap<Long, Integer>();
                    int vectorSize = 0;
                    while ((row = in2.readLine()) != null) {
                        row = row.trim();
                        if (StringUtils.isBlank(row))
                            continue;
                        String[] subItems = row.split("\t");
                        if (subItems.length != 2)
                            continue;
                        for (i = 0; i < subItems.length; i++)
                            if (StringUtils.isBlank(subItems[i]))
                                break;
                        if (i != subItems.length) {
                            String mesg = "Data File Format Error for ee Map " + row;
                            log.info(mesg);
                            in2.close();
                            throw new IOException(mesg);
                        }
                        this.eeIndexMap.put(new Long(subItems[0].trim()), new Integer(subItems[1].trim()));
                        if (Integer.valueOf(subItems[1].trim()).intValue() > vectorSize)
                            vectorSize = Integer.valueOf(subItems[1].trim()).intValue();
                    }
                    eeMap = new HashMap<Integer, ExpressionExperiment>();
                    for (Long iter : this.eeIndexMap.keySet()) {
                        ExpressionExperiment ee = eeService.load(iter);
                        eeMap.put(this.eeIndexMap.get(iter), ee);
                    }
                    log.info("Got " + this.eeIndexMap.size() + " in EE MAP");

                }
            }
            computeShift();
        }
    }

    /**
     * Initialize with all genes for the given taxon.
     * 
     * @param taxon
     */
    public LinkMatrix(Taxon taxon) {
        Collection<Gene> allGenes = geneService.getGenesByTaxon(taxon);

        log.info("Got " + allGenes.size() + " genes");
        if (allGenes.size() == 0)
            return;
        init(allGenes);
    }

    /**
     * @param id
     * @param eeIndex
     * @return
     */
    public boolean checkEEConfirmation(long id, int eeIndex) {
        int rows = (int) (id / shift);
        int cols = (int) (id % shift);
        return linkCountMatrix.get(rows, cols, eeIndex);
    }

    /**
     * @param treeIds
     * @param rank the number of top ranked GO terms to return
     * @return
     */
    public Map<OntologyTerm, Integer> computeGOOverlap(Collection<Long> treeIds, int rank) {
        Collection<Gene> genes = new HashSet<Gene>();
        for (Long treeId : treeIds) {
            int row = (int) (treeId / this.shift);
            int col = (int) (treeId % this.shift);
            genes.add(getRowGene(row));
            genes.add(getColGene(col));
        }
        Map<OntologyTerm, Integer> res = new HashMap<OntologyTerm, Integer>();
        ObjectArrayList counter = new ObjectArrayList(rank);
        for (int i = 0; i < counter.size(); i++)
            counter.add(new Integer(0));
        for (Gene gene : genes) {
            Collection<OntologyTerm> goEntries = goService.getGOTerms(gene);
            for (OntologyTerm goEntry : goEntries) {
                Integer goNum = new Integer(1);
                if (res.containsKey(goEntry)) {
                    goNum = res.get(goEntry);
                    goNum = goNum + 1;
                    res.put(goEntry, goNum);
                } else {
                    res.put(goEntry, goNum);
                }
            }
        }
        if (rank >= res.keySet().size())
            return res;
        for (OntologyTerm ontologyTerm : res.keySet()) {
            Integer goNum = res.get(ontologyTerm);
            counter.add(goNum);
        }
        counter.sort();
        Integer threshold = (Integer) counter.get(counter.size() - rank);
        Collection<OntologyTerm> removed = new HashSet<OntologyTerm>();
        for (OntologyTerm ontologyTerm : res.keySet()) {
            Integer goNum = res.get(ontologyTerm);
            if (goNum < threshold)
                removed.add(ontologyTerm);
        }
        for (OntologyTerm ontologyTerm : removed)
            res.remove(ontologyTerm);
        return res;
    }

    /**
     * @param packedId
     * @return
     */
    public int computeGOOverlap(long packedId) {
        int row = (int) (packedId / shift);
        int col = (int) (packedId % shift);
        return goService.calculateGoTermOverlap(getRowGene(row), getColGene(col)).size();
    }

    /**
     * @param row
     * @param col
     * @return
     */
    public long generateId(int row, int col) {
        return (long) row * (long) shift + col;
    }

    /**
     * @param i
     * @return
     */
    public Gene getColGene(int i) {
        Object geneId = linkCountMatrix.getColName(i);
        return geneMap.get(geneId);
    }

    /**
     * @param i
     * @return
     */
    public ExpressionExperiment getEE(int i) {
        return eeMap.get(new Integer(i));
    }

    /**
     * @param eeId
     * @return
     */
    public int getEEIndex(long eeId) {
        Integer eeIndex = eeIndexMap.get(eeId);
        if (eeIndex == null)
            return -1;
        return eeIndex;
    }

    /**
     * @param i
     * @return
     */
    public String getEEName(int i) {
        return getEE(i).getShortName();
    }

    /**
     * @param mask
     * @return
     */
    public Collection<String> getEENames(long[] mask) {
        Set<String> returnedSet = new HashSet<String>();
        for (int i = 0; i < mask.length; i++) {
            for (int j = 0; j < CompressedBitMatrix.BITS_PER_ELEMENT; j++)
                if ((mask[i] & (CompressedBitMatrix.BIT1 << j)) != 0) {
                    returnedSet.add(getEEName(j + i * CompressedBitMatrix.BITS_PER_ELEMENT));
                }
        }
        return returnedSet;
    }

    /**
     * @param gene
     * @return
     */
    public Collection<OntologyTerm> getGOTerms(Gene gene) {
        return this.goService.getGOTerms(gene);
    }

    /**
     * @param id
     * @return A string representing 1) the genes and 2) their GO term overlap.
     */
    public String getLinkName(long id) {
        Gene[] pairOfGenes = getPairedGenes(id);
        assert pairOfGenes.length == 2;
        return pairOfGenes[0].getName() + "_" + pairOfGenes[1].getName() + "_"
                + goService.calculateGoTermOverlap(pairOfGenes[0], pairOfGenes[1]).size();
    }

    /**
     * @param packedId A bit array storing two gene ids, with one stored in the upper bits and the other in the lower
     *        bits (offset is defined by this.shift).
     * @return Array containing two genes.
     */
    public Gene[] getPairedGenes(long packedId) {
        Gene[] pairedGene = new Gene[2];
        int row = (int) (packedId / shift);
        int col = (int) (packedId % shift);
        pairedGene[0] = getRowGene(row);
        pairedGene[1] = getColGene(col);
        return pairedGene;
    }

    public CompressedBitMatrix<Long, Long> getRawMatrix() {
        return this.linkCountMatrix;
    }

    /**
     * @param i
     * @return
     */
    public Gene getRowGene(int i) {
        Object geneId = linkCountMatrix.getRowName(i);
        return geneMap.get(geneId);
    }

    /**
     * @param ees
     * @param targetGenes
     * @param coExpressedGenes
     */
    public void init(Collection<ExpressionExperiment> ees, Collection<Gene> t, Collection<Gene> coExpressedGenes) {
        CompressedBitMatrix<Long, Long> linkCount = new CompressedBitMatrix<Long, Long>(targetGenes.size(),
                coExpressedGenes.size(), ees.size());
        for (Gene geneIter : targetGenes) {
            linkCount.addRowName(geneIter.getId());
        }
        for (Gene geneIter : coExpressedGenes) {
            linkCount.addColumnName(geneIter.getId());
        }
        eeIndexMap = new HashMap<Long, Integer>();
        eeMap = new HashMap<Integer, ExpressionExperiment>();
        int index = 0;
        for (ExpressionExperiment eeIter : ees) {
            eeIndexMap.put(eeIter.getId(), new Integer(index));
            eeMap.put(new Integer(index), eeIter);
            index++;
        }
        geneMap = new HashMap<Long, Gene>();
        for (Gene gene : targetGenes) {
            geneMap.put(gene.getId(), gene);
        }
        if (targetGenes != coExpressedGenes) {
            for (Gene gene : coExpressedGenes) {
                geneMap.put(gene.getId(), gene);
            }
        }
        this.targetGenes = t;
    }

    /**
     * Output the gene
     * 
     * @param gene
     * @param num
     */
    public void output(Gene gene, int num) {
        int row = this.linkCountMatrix.getRowIndexByName(gene.getId());
        if (row < 0 || row >= this.linkCountMatrix.rows()) {
            log.info("Gene does not exist");
            return;
        }
        for (int col = 0; col < this.linkCountMatrix.columns(); col++)
            if (this.linkCountMatrix.bitCount(row, col) >= num) {
                System.err.println(getColGene(col).getName() + " " + this.linkCountMatrix.bitCount(row, col));
            }
        System.err.println("=====================================================");
        for (int col = 0; col < this.linkCountMatrix.columns(); col++)
            if (this.linkCountMatrix.bitCount(row, col) >= num) {
                System.err.println(getColGene(col).getName());
            }
    }

    /**
     * @param num
     */
    public void output(int num) {
        int count = 0;
        for (int i = 0; i < this.linkCountMatrix.rows(); i++) {
            for (int j = 0; j < this.linkCountMatrix.columns(); j++) {
                if (this.linkCountMatrix.bitCount(i, j) >= num) {
                    System.err.println(getRowGene(i).getName() + "  " + getColGene(j).getName() + " "
                            + this.linkCountMatrix.bitCount(i, j));
                    count++;
                }
            }
        }
        System.err.println("Total Links " + count);
    }

    /**
     * 
     */
    public void outputStat() {
        int maxNum = 50;
        Vector<Integer> count = new Vector<Integer>(maxNum);
        for (int i = 0; i < maxNum; i++)
            count.add(0);
        for (int i = 0; i < this.linkCountMatrix.rows(); i++) {
            for (int j = i + 1; j < this.linkCountMatrix.columns(); j++) {
                int num = this.linkCountMatrix.bitCount(i, j);
                if (num == 0)
                    continue;
                if (num > maxNum) {
                    for (; maxNum < num; maxNum++)
                        count.add(0);
                }
                Integer tmpno = count.elementAt(num - 1);
                tmpno = tmpno + 1;
                count.setElementAt(tmpno, num - 1);
            }
        }
        for (int i = 0; i < count.size(); i++) {
            System.err.print(i + "[" + count.elementAt(i) + "] ");
            if (i % 10 == 0)
                System.err.println("");
        }
    }

    /**
     * Output is a file with the row and column indices and support for each link in the matrix (subject to the
     * stringency)
     * 
     * @param outFile path to file to save to
     */
    public void saveLinkMatrix(String outFile) {
        try {
            ObjectArrayList nodes = new ObjectArrayList();

            FileWriter out = new FileWriter(new File(outFile));
            for (int i = 0; i < linkCountMatrix.rows(); i++) {
                if (i % 1000 == 0)
                    System.err.println(i + " -> " + linkCountMatrix.rows());
                for (int j = i + 1; j < linkCountMatrix.columns(); j++) {
                    if (linkCountMatrix.bitCount(i, j) >= stringency) {
                        TreeNode oneNode = new TreeNode(generateId(i, j), linkCountMatrix.getAllBits(i, j), null);
                        nodes.add(oneNode);
                    }
                }
            }
            for (int rowIndex = 0; rowIndex < nodes.size(); rowIndex++) {
                TreeNode rowNode = (TreeNode) nodes.get(rowIndex);
                for (int colIndex = rowIndex + 1; colIndex < nodes.size(); colIndex++) {
                    TreeNode colNode = (TreeNode) nodes.get(colIndex);
                    int commonBits = overlapBits(rowNode.getMask(), colNode.getMask());
                    if (commonBits >= stringency) {
                        out.write(rowIndex + "\t" + colIndex + "\t" + commonBits + "\n");
                        out.write(colIndex + "\t" + rowIndex + "\t" + commonBits + "\n");
                    }
                }

            }
            out.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public void setEEService(ExpressionExperimentService eeService) {
        this.eeService = eeService;
    }

    public void setGeneService(GeneService geneService) {
        this.geneService = geneService;
    }

    public void setGoService(GeneOntologyService goService) {
        this.goService = goService;
    }

    public void setStringency(int stringency) {
        this.stringency = stringency;
    }

    public void toFile(String matrixFile, String eeMapFile) throws IOException {
        linkCountMatrix.toFile(matrixFile);
        FileWriter out = new FileWriter(new File(eeMapFile));
        for (Long index : this.eeIndexMap.keySet()) {
            out.write(index + "\t" + this.eeIndexMap.get(index) + "\n");
        }
        out.close();
    }

    /**
     * For test purposes only.
     * 
     * @param i
     * @param j
     * @return
     */
    protected boolean filter(int i, int j) {
        // String geneName1 = getRowGene( i ).getName();
        // String geneName2 = getColGene( j ).getName();
        // if(geneName1.matches("(RPL|RPS)(.*)") ||geneName2.matches("(RPL|RPS)(.*)"))
        // return true;
        return false;
    }

    /**
     * Ensure the shift is high enough to store genes in pairs in a single 64-bit vector.
     */
    private void computeShift() {
        shift = linkCountMatrix.rows() > linkCountMatrix.columns() ? linkCountMatrix.rows()
                : linkCountMatrix.columns();
    }

    /**
     * @param genes
     */
    private void init(Collection<Gene> genes) {
        if (genes == null || genes.size() == 0)
            return;
        Taxon taxon = genes.iterator().next().getTaxon();
        Collection<ExpressionExperiment> ees = eeService.findByTaxon(taxon);
        if (ees == null || ees.size() == 0)
            return;
        init(genes, ees);
    }

    /**
     * @param genes
     * @param ees
     */
    private void init(Collection<Gene> genes, Collection<ExpressionExperiment> ees) {
        if (genes == null || ees == null || genes.size() == 0 || ees.size() == 0)
            return;
        Collection<Gene> genesInTaxon = geneService.getGenesByTaxon(genes.iterator().next().getTaxon());
        if (genesInTaxon == null || genesInTaxon.size() == 0)
            return;

        init(ees, genes, genesInTaxon);
    }
}