ubic.BAMSandAllen.RankedGeneListLoader.java Source code

Java tutorial

Introduction

Here is the source code for ubic.BAMSandAllen.RankedGeneListLoader.java

Source

/*******************************************************************************
 * The ABAMS project
 * 
 * Copyright (c) 2012 University of British Columbia
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *       http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package ubic.BAMSandAllen;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import ubic.BAMSandAllen.AllenDataLoaders.AllenCatalogMatrices2;
import ubic.BAMSandAllen.AllenDataLoaders.ImageSeriesInfoLoader;
import ubic.BAMSandAllen.AnalyzeBAMSandAllenGenes.Direction;
import ubic.BAMSandAllen.FocusedAnalysis.ExploreRegionNames;
import ubic.BAMSandAllen.MatrixPairs.ConnectivityAndAllenExpressionMatrixPair;
import ubic.BAMSandAllen.MatrixPairs.ExpressionMatrixPairFactory;
import ubic.BAMSandAllen.MatrixPairs.MatrixPair;
import ubic.BAMSandAllen.geneFilters.PlaneRemoveFilter.Plane;
import ubic.basecode.dataStructure.StringToStringSetMap;
import ubic.basecode.dataStructure.matrix.DoubleMatrix;
import ubic.basecode.util.FileTools;
import au.com.bytecode.opencsv.CSVReader;

public class RankedGeneListLoader {
    private static Log log = LogFactory.getLog(RankedGeneListLoader.class.getName());
    static final int CORRELATION_TREND = 5;
    List<String> lines;
    String filename;
    Double threshold;

    public List<String> getLines() {
        return lines;
    }

    public void addToFront(List<String> addIn) {
        lines.addAll(0, addIn);
    }

    public RankedGeneListLoader(String filename) throws Exception {
        this(filename, true);
    }

    public RankedGeneListLoader(String filename, boolean reverse) throws Exception {
        this.filename = filename;
        lines = FileTools.getLines(filename);
        log.info("Lines read:" + lines.size() + " " + filename);
        // chomp the last one if it's blank
        if (lines.get(lines.size() - 1).equals("")) {
            lines.remove(lines.size() - 1);
        }
        // assume the last is the first ranked gene
        if (reverse)
            reverse();
        threshold = null;
    }

    public RankedGeneListLoader(List<String> lines, String filename) throws Exception {
        this.filename = filename;
        this.lines = lines;
        threshold = null;
    }

    /**
     * Writes information by using the matrix pair - a list of top genes and a the correlation after their removal
     * 
     * @param pair
     * @return
     * @throws Exception
     */
    public String writeCorrelationBasedData(MatrixPair pair) throws Exception {
        return writeCorrelationBasedData(pair, true);
    }

    public String getResultFilename() {
        String result = filename.replace("LOOGenesInOrder", "LOOResults");
        return result;
    }

    public String getCorrelationLevelPoint() throws Exception {
        return writeEndSetGenes(true);
    }

    public String writeEndSetGenes(boolean keepSign) throws Exception {
        List<String> topGenes = new LinkedList<String>();
        final int correlation_column = 1;
        final int gene_column = 2;
        String resultFilename = getResultFilename();
        CSVReader reader = new CSVReader(new FileReader(resultFilename));
        List<String[]> correlationLines = reader.readAll();
        double firstBaseLine = Double.parseDouble(correlationLines.get(0)[correlation_column]);

        boolean increase = firstBaseLine > 0;
        if (!keepSign)
            increase = !increase;

        List<String> rowsInFile = new LinkedList<String>();

        double apexCorrelation;

        if (increase)
            apexCorrelation = Double.MIN_VALUE;
        else
            apexCorrelation = Double.MAX_VALUE;

        // find the apex
        for (String[] line : correlationLines) {
            double correlation = Double.parseDouble(line[correlation_column]);
            if (increase && (apexCorrelation < correlation)) {
                apexCorrelation = correlation;
            }
            if (!increase && (apexCorrelation > correlation))
                apexCorrelation = correlation;
        }
        log.info("Peak correlation:" + apexCorrelation);

        int count = 0;
        boolean reachedApex = false;
        // get genes after the apex, could use index instead
        for (String[] line : correlationLines) {
            count++;
            String row = line[gene_column];
            rowsInFile.add(row);
            double correlation = Double.parseDouble(line[correlation_column]);

            if (correlation == apexCorrelation)
                reachedApex = true;

            if (reachedApex) {
                topGenes.add(row);
            }
        }

        // the results file maybe missing genes at the end, add them in
        List<String> addToEnd = new LinkedList<String>(lines);
        addToEnd.removeAll(rowsInFile);
        log.info("adding " + addToEnd.size() + " genes to end");
        topGenes.addAll(addToEnd);

        threshold = ((double) topGenes.size()) / lines.size();
        // round to 5 decimals
        threshold = Double.parseDouble(String.format("%.5g%n", threshold));

        String outFileName = filename + "." + topGenes.size() + "." + threshold + ".topGenes.txt";
        FileTools.stringsToFile(topGenes, new File(outFileName));
        log.info("Wrote file: " + outFileName);

        return outFileName;
    }

    public void convertToNCBIIDs() throws Exception {
        ImageSeriesInfoLoader infoLoader = new ImageSeriesInfoLoader();
        List<String> IDs = new LinkedList<String>();
        int nullNCBI = 0;
        for (String geneRowName : lines) {
            Integer ncbiID = infoLoader.getNCBIIDFromRowName(geneRowName);
            if (ncbiID != null) {
                IDs.add("" + ncbiID);
            } else {
                nullNCBI++;
            }
        }
        log.info("Null NCBI genes:" + nullNCBI);
        lines = IDs;
    }

    @Deprecated
    public String writeCorrelationBasedData(MatrixPair pair, boolean keepSign) throws Exception {
        // compute correlations
        // create pair? erg, no arguments
        // get correlation numbers
        reverse();

        double firstBaseLine = pair.getCorrelation(true);
        // make it more negative if it starts below zero
        boolean increase = firstBaseLine > 0;

        // if we are going against the current correlation sign - eg go from positive correlation to negative
        if (!keepSign) {
            increase = !increase;
        }

        // String result;// = "NumberRemoved\tCorrelation\n"+

        String newLine = System.getProperty("line.separator");
        String result = "";
        List<String> topGenes = new LinkedList<String>();

        // only work with the lines we have
        pair.setMatrixBDataRows(lines);
        long startTime = System.currentTimeMillis();

        result += "0\t" + pair.getCorrelation(true) + "\t" + "AllGenes" + newLine;
        double lastCorrelation = pair.getCorrelation(true);
        int count = 0;
        for (String row : lines) {
            count++;
            pair.removeMatrixBDataRowFast(row);
            double correlation = pair.getCorrelation(true);
            result += count + "\t" + correlation + "\t" + row + newLine;
            if (count % 1000 == 0)
                log.info("Removing:" + row + " Correlation after:" + correlation + " Removed:" + count);

            FileTools.stringToFile(count + "," + correlation + "," + row + "\n",
                    new File(SetupParameters.getDataFolder() + "LOOResults." + startTime + ".txt"), true);

            // if we are no longer gaining correlation then start keeping track
            // boolean decreasingCurrent = lastCorrelation - correlation > 0;

            // if we have more than four already, consider it a trend
            // if ( ( decreasingCurrent && increase ) || ( !decreasingCurrent && !increase )
            // || topGenes.size() > CORRELATION_TREND ) {
            // // if the non expressing genes were added to the end of the list then don't add this gene
            // // if ( count > 20000 ) {
            // topGenes.add( row );
            // log.info( "Top gene add " + count );
            // // }
            // } else {
            // // require a decrease for four genes in a row to start topGene list
            // if ( topGenes.size() != 0 && topGenes.size() < CORRELATION_TREND ) {
            // topGenes.clear();
            // log.info( "Top genes clear" );
            // }
            // }
            // lastCorrelation = correlation;
        }

        // // put it back in order
        // reverse();
        //
        // // make it so top gene is top
        // Collections.reverse( topGenes );
        //
        // FileTools.stringToFile( result, new File( filename + ".forPlotting" ) );
        // log.info( "Wrote file: " + filename + ".forPlotting" );
        //
        // FileTools.stringsToFile( topGenes, new File( filename + ".topGenes.txt" ) );
        // log.info( "Wrote file: " + filename + "." + topGenes.size() + ".topGenes.txt" );

        // RankedGeneListLoader tempLoader = new RankedGeneListLoader( topGenes, filename + "." + topGenes.size()
        // + ".topGenes" );
        // tempLoader.forErmineJ();

        return result;
    }

    public List<String> addMissing(boolean top) throws Exception {
        List<String> originalRows = getOriginalRows();

        List<String> missing = new LinkedList<String>(originalRows);
        missing.removeAll(lines);
        // consistnent shuffling for now
        Collections.shuffle(missing, new Random(1));
        log.info("Missing size:" + missing.size());
        // log.info( "Missing:" + missing );
        // add to start of list
        for (String missed : missing) {
            if (top)
                lines.add(0, missed);
            else
                lines.add(missed);
        }
        return missing;
    }

    public List<String> addMissingToBottom() throws Exception {
        return addMissing(false);
    }

    public RankedGeneListLoader getShiftedList(RankedGeneListLoader list2) throws Exception {
        Map<String, Integer> shifts = new HashMap<String, Integer>();
        for (int i = 0; i < lines.size(); i++) {
            String gene = lines.get(i);
            int j = list2.lines.indexOf(gene);
            int shift = i - j;
            shifts.put(gene, shift);
        }

        List<Map.Entry<String, Integer>> list = new LinkedList(shifts.entrySet());
        Collections.sort(list, new Comparator() {
            public int compare(Object o1, Object o2) {
                return ((Comparable) ((Map.Entry) (o1)).getValue()).compareTo(((Map.Entry) (o2)).getValue());
            }
        });

        List<String> result = new LinkedList<String>();
        for (Map.Entry<String, Integer> entry : list) {
            // log.info( entry.getKey() + "->" + entry.getValue() );
            result.add(entry.getKey());
        }
        return new RankedGeneListLoader(result, filename + ".shifted");
    }

    public int getRank(String gene) {
        return lines.indexOf(gene);
    }

    public void reverse() {
        Collections.reverse(lines);
    }

    public void removeRikStars() {
        List<String> result = new LinkedList<String>();
        int stars = 0;
        for (String line : lines) {
            if (line.matches(".*Rik[*].*")) {
                line = line.replaceAll("Rik[*]", "Rik");
                stars++;
            }
            result.add(line);
        }
        log.info("Stars removed:" + stars);
        lines = result;
    }

    public void removeImageIDs() {
        List<String> result = new LinkedList<String>();
        for (String line : lines) {
            line = line.replaceAll("\\[.*\\]", "");
            result.add(line);
        }
        lines = result;
    }

    public String getTabSepRanks() {
        String result = "gene\tscore\n";
        double rank = 0;
        for (String line : lines) {
            double scale = rank / lines.size();
            result += line + "\t" + scale + System.getProperty("line.separator");
            rank++;
        }
        return result;
    }

    public List<String> getRows() {
        return lines;
    }

    public int size() {
        return lines.size();
    }

    public String forErmineJ() throws Exception {
        removeRikStars();
        removeImageIDs();
        return writeOut();
    }

    public String forErmineJNCBI() throws Exception {
        convertToNCBIIDs();
        return writeOut("NCBI");
    }

    public String writeOut() throws Exception {
        return writeOut("symbols");
    }

    public String writeOut(String endfix) throws Exception {
        log.info("Size for writing:" + size());
        String out_filename = filename + "." + endfix + ".forErmine";
        FileTools.stringToFile(getTabSepRanks(), new File(out_filename));
        log.info("Wrote file: " + out_filename);
        return out_filename;
    }

    // public void makeChartData( MatrixPair pair ) throws Exception {
    // FileTools.stringToFile( getTabSepChartData( pair ), new File( filename + ".forPlotting" ) );
    // log.info( "Wrote file: " + filename + ".forPlotting" );
    // }

    public static List<String> getOriginalRows() throws Exception {
        AllenCatalogMatrices2 allenMatrices = new AllenCatalogMatrices2();
        DoubleMatrix<String, String> dataMatrix = allenMatrices.getFromDisk("NewEnergies");
        return dataMatrix.getRowNames();
    }

    public static void processDirectoryNoMissing(String directory, ConnectivityAndAllenExpressionMatrixPair pair)
            throws Exception {
        processDirectoryNoMissing(directory, pair, true);
    }

    public static void processDirectoryNoMissing(String directory, ConnectivityAndAllenExpressionMatrixPair pair,
            boolean keepSign) throws Exception {
        Collection<File> files = FileTools.listDirectoryFiles(new File(directory));

        File genSetGenesFile = new File(directory + "/GeneSetVennMasterGenes.txt");
        genSetGenesFile.delete();

        File genSetFile = new File(directory + "/GeneSetVennMaster.txt");
        genSetFile.delete();

        for (File file : files) {
            String name = file.getName();
            if (name.contains("LOOGenesInOrder") && !name.contains("forErmine") && !name.contains("topGenes")
                    && !name.contains(".xls")) {
                RankedGeneListLoader loaderIn = new RankedGeneListLoader(file.toString());
                String topGenesFileName = loaderIn.writeEndSetGenes(keepSign);
                String SymbolsFilename = loaderIn.forErmineJ();

                // reload
                RankedGeneListLoader NCBIloaderIn = new RankedGeneListLoader(file.toString());
                String NBCIFileName = NCBIloaderIn.forErmineJNCBI();

                // runErmineJORA( loaderIn.threshold, SymbolsFilename, false );
                runErmineJORA(loaderIn.threshold, NBCIFileName, true);

                // generate hascoronal list
                RankedGeneListLoader hasCoronal = new RankedGeneListLoader(file.toString());
                hasCoronal.filename += ".hasCoronal";
                hasCoronal.retainHasCoronal();
                hasCoronal.forErmineJNCBI();

                // generate coronal list
                RankedGeneListLoader coronal = new RankedGeneListLoader(file.toString());
                coronal.filename += ".coronal";
                coronal.retainCoronal();
                coronal.forErmineJNCBI();

                RankedGeneListLoader loaderTop = new RankedGeneListLoader(topGenesFileName);
                TopTenInfo topTen = new TopTenInfo(loaderTop, pair);
                topTen.writeExpressionInfo();

                String setName = name.substring(16, name.indexOf(".txt"));
                Util.addToVennMasterFile(genSetFile, loaderTop.lines, setName);

                // without IDS
                loaderTop.removeImageIDs();
                Util.addToVennMasterFile(genSetGenesFile, loaderTop.lines, setName);

            }
        }
    }

    public static void runErmineJORA(double threshold, String list_filename, boolean NCBI) throws Exception {
        runErmineJORA(threshold, list_filename, NCBI, 5, 200);
    }

    public static void runErmineJORA(double threshold, String list_filename, boolean NCBI, int minSize, int maxSize)
            throws Exception {
        // erg, so annoying
        List<String> commandParts = new LinkedList<String>();
        commandParts.add(SetupParameters.config.getString("abams.ermineJ.bin"));
        if (NCBI) {
            commandParts.add("--annots");
            commandParts.add("\"" + SetupParameters.config.getString("abams.ermineJ.NCBIannots") + "\"");
        } else {
            commandParts.add("--annots");
            commandParts.add("\"" + SetupParameters.config.getString("abams.ermineJ.annots") + "\"");
        }
        commandParts.add("--config"); // config from GUI
        commandParts.add("\"" + SetupParameters.config.getString("abams.ermineJ.baseConfig") + "\"");

        commandParts.add("--classFile");
        commandParts.add("\"" + SetupParameters.config.getString("abams.ermineJ.classFile") + "\"");
        commandParts.add("--output");
        commandParts.add("\"" + list_filename + ".ORA.tsv\"");
        commandParts.add("--threshold");
        commandParts.add("" + threshold);
        commandParts.add("--test");
        commandParts.add("0");
        commandParts.add("--scoreFile");
        commandParts.add("\"" + list_filename + "\"");
        commandParts.add("--maxClassSize");
        commandParts.add("" + maxSize);
        commandParts.add("--minClassSize");
        commandParts.add("" + minSize);
        commandParts.add("-j"); // show genes

        // log.info( command );
        // execute!
        log.info(commandParts);
        Process p = Runtime.getRuntime().exec((String[]) commandParts.toArray(new String[commandParts.size()]));
        log.info("Waiting for ermineJ");

        StreamGobbler outputGobbler = new StreamGobbler(p.getInputStream(), "ErmineJ");
        outputGobbler.start();
        StreamGobbler errGobbler = new StreamGobbler(p.getErrorStream(), "ErmineJ");
        errGobbler.start();
        p.waitFor();

    }

    static class StreamGobbler extends Thread {
        InputStream is;
        String type;

        StreamGobbler(InputStream is, String type) {
            this.is = is;
            this.type = type;
        }

        public void run() {
            try {
                InputStreamReader isr = new InputStreamReader(is);
                BufferedReader br = new BufferedReader(isr);
                String line = null;
                while ((line = br.readLine()) != null)
                    System.out.println(type + ">" + line);
            } catch (IOException ioe) {
                ioe.printStackTrace();
            }
        }
    }

    @Deprecated
    public static void processDirectory(String directory, boolean hasNonExp) throws Exception {
        Collection<File> files = FileTools.listDirectoryFiles(new File(directory));
        for (File file : files) {
            RankedGeneListLoader loaderIn = new FullRankedGeneListLoader(file.toString(), hasNonExp);
            loaderIn.forErmineJ();
        }
    }

    /**
     * @param args
     */

    public String getTotalExpresion() throws Exception {
        String result = "";
        AllenCatalogMatrices2 allenMatrices = new AllenCatalogMatrices2();
        DoubleMatrix<String, String> dataMatrix = allenMatrices.getFromDisk("NewEnergies");

        for (String geneRow : lines) {
            double[] values = dataMatrix.getRowByName(geneRow);
            double sum = 0;
            for (double d : values)
                sum += d;
            result += geneRow + "\t" + sum + System.getProperty("line.separator");
        }

        return result;
    }

    public void retainHasCoronal() throws Exception {
        ImageSeriesInfoLoader loader = new ImageSeriesInfoLoader();
        List<String> newLines = new LinkedList<String>();
        for (String row : lines) {
            boolean hasCoronal = loader.hasCoronalImageFromRowName(row);
            if (hasCoronal)
                newLines.add(row);
        }
        this.lines = newLines;
    }

    public void retainCoronal() throws Exception {
        ImageSeriesInfoLoader loader = new ImageSeriesInfoLoader();
        List<String> newLines = new LinkedList<String>();
        for (String row : lines) {
            String planeString = loader.getPlaneFromRowName(row);
            if (planeString.equals("coronal"))
                newLines.add(row);
        }
        this.lines = newLines;
    }

    public static void main(String[] args) throws Exception {

        // RankedGeneListLoader jesseIdea = new RankedGeneListLoader(
        // "/grp/java/workspace/BAMSandAllen/data/rankedGenes/ranked once/Incoming.partialcon.genes.txt" );

        boolean removeNonExp = true;
        boolean useVirtual = true;
        boolean keepSign = true;

        // ERG, gene info top ten results varies for direction
        Direction direction = AnalyzeBAMSandAllenGenes.Direction.ANYDIRECTION;

        ConnectivityAndAllenExpressionMatrixPair pair = ExpressionMatrixPairFactory
                .connectivityAndExpression(direction, useVirtual, removeNonExp);
        // ConnectivityAndAllenExpressionMatrixPair pair = ExpressionMatrixPairFactory.connectivityPartial( direction,
        // false, RegressMatrix.CONNECTIVITY, useVirtual, removeNonExp, true );

        // need a partialcon pair
        // jesseIdea.writeCorrelationBasedData( pair );
        // System.exit( 1 );
        // pair.applyGeneFilter( new PrefixGeneFilter( "Drd" ) );
        // log.info( "Genes:" + Util.getUniqueGenes( pair.getMatrixBDataRows() ) );
        // log.info( pair.getCorrelation() );
        // pair.writeImages();
        //
        // System.exit( 1 );

        // processDirectoryNoMissing( "/grp/java/workspace/BAMSandAllen/data/rankedGenes/direct increasing", pair );

        // processDirectoryNoMissing( "/grp/java/workspace/BAMSandAllen/data/rankedGenes/near final nobed", pair );
        processDirectoryNoMissing("/grp/java/workspace/BAMSandAllen/data/rankedGenes/LitCon", pair);
        System.exit(1);

        // LOOGenesInOrder.out.partialcon.txt.329.0.014448.topGenes.txt
        // LOOGenesInOrder.in.partialcon.txt.410.0.018005.topGenes.txt
        // LOOGenesInOrder.space.7.txt.435.0.019093.topGenes.txt

        /*
         * RankedGeneListLoader aLook = new RankedGeneListLoader(
         * "/grp/java/workspace/BAMSandAllen/data/rankedGenes/near final
         * nobed/LOOGenesInOrder.space.7.txt.435.0.019093.topGenes.txt" );
         */
        // ConnectivityAndAllenExpressionMatrixPair.NewEnergies.incoming.rOrder
        // ConnectivityAndAllenExpressionMatrixPair.NewEnergies.space.rOrder
        // ConnectivityAndAllenExpressionMatrixPair.NewEnergies.outgoing.rOrder
        // LOOGenesInOrder.out.partialcon.txt.329.0.014448.topGenes.txt
        // LOOGenesInOrder.in.partialcon.txt.410.0.018005.topGenes.txt
        // LOOGenesInOrder.space.7.txt.435.0.019093.topGenes.txt
        RankedGeneListLoader aLook = new RankedGeneListLoader(
                "/grp/java/workspace/BAMSandAllen/data/rankedGenes/near final nobed/LOOGenesInOrder.out.partialcon.txt.329.0.014448.topGenes.txt");
        pair.setMatrixBDataRows(aLook.lines);
        // log.info( pair.getCorrelation() );
        // pair.writeRMatrices();
        // pair.writeImages();
        pair.orderDataRows(aLook.lines);
        log.info(pair.getCorrelation());
        System.exit(1);
        pair.writeRMatrices();
        pair.writeImages();

        ExploreRegionNames explore = new ExploreRegionNames(pair);
        StringToStringSetMap parents = explore.getParents();

        String focusRegion = "Midbrain";
        Set<String> ROIs = parents.get(focusRegion);
        // some may have no exp
        ROIs.retainAll(pair.getAllenDataColNames());
        pair.removeAllenCols(ROIs);
        // Hindbrain
        // Interbrain
        // Midbrain
        // Cerebrum
        pair.run();

        log.info(focusRegion + ":" + pair.getCorrelation());
        pair.test(1000);

        System.exit(1);

        List<String> lines = aLook.lines;
        lines.remove(0);
        lines.remove(1);
        lines.remove(2);
        lines.remove(3);
        lines.remove(4);
        lines.remove(5);
        lines.remove(6);
        lines.remove(7);
        lines.remove(8);
        lines.remove(9);
        pair.setMatrixBDataRows(aLook.lines);
        log.info(pair.getCorrelation());
        log.info(lines.size());

        // processDirectoryNoMissing( "/grp/java/workspace/BAMSandAllen/data/rankedGenes/for Ray", pair );
        // processDirectoryNoMissing( "/grp/java/workspace/BAMSandAllen/data/rankedGenes/near final 962 regions", pair
        // );
        // processDirectoryNoMissing( "/grp/java/workspace/BAMSandAllen/data/rankedGenes/decreasing after lab meeting",
        // pair, keepSign );

    }
}