Java tutorial
/* * Copyright (c) 2007-2012 The Broad Institute, Inc. * SOFTWARE COPYRIGHT NOTICE * This software and its documentation are the copyright of the Broad Institute, Inc. All rights are reserved. * * This software is supplied without any warranty or guaranteed support whatsoever. The Broad Institute is not responsible for its use, misuse, or functionality. * * This software is licensed under the terms of the GNU Lesser General Public License (LGPL), * Version 2.1 which is available at http://www.opensource.org/licenses/lgpl-2.1.php. */ package org.broad.igv.cbio; import biz.source_code.base64Coder.Base64Coder; import org.apache.commons.collections.Predicate; import org.apache.log4j.Logger; import org.broad.igv.DirectoryManager; import org.broad.igv.Globals; import org.broad.igv.PreferenceManager; import org.broad.igv.feature.FeatureDB; import org.broad.igv.feature.NamedFeature; import org.broad.igv.track.RegionScoreType; import org.broad.igv.track.Track; import org.broad.igv.ui.panel.FrameManager; import org.broad.igv.ui.panel.ReferenceFrame; import org.broad.igv.util.*; import org.jgrapht.EdgeFactory; import org.jgrapht.graph.DirectedMultigraph; import org.w3c.dom.*; import org.xml.sax.SAXException; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import java.io.*; import java.util.*; import java.util.zip.GZIPOutputStream; /** * Fetch a gene network from cBio portal. WORK IN PROGRESS. * User: jacob * Date: 2012/01/31 */ public class GeneNetwork extends DirectedMultigraph<Node, Node> { private Logger log = Logger.getLogger(GeneNetwork.class); public static final String NODE_TAG = "node"; public static final String EDGE_TAG = "edge"; public static final String KEY = "key"; public static final String LABEL = "label"; /** * URL that cbio will use when service is released */ static final String REAL_URL = "http://www.cbioportal.org/public-portal/network.do"; /** * URL they use for testing */ static final String TEST_URL = "http://awabi.cbio.mskcc.org/public-portal/network.do"; static String BASE_URL = REAL_URL; // static{ // InputStream is = NetworkAnnotator.class.getResourceAsStream("resources/url.txt"); // BufferedReader br = new BufferedReader(new InputStreamReader(is)); // try{ // BASE_URL = br.readLine(); // }catch(IOException e){ // logger.error("url resource not found, defaulting to " + TEST_URL); // BASE_URL = TEST_URL; // } // } private static final String common_parms = "format=gml&gzip=on"; private static final String GENE_LIST = "gene_list"; private Map<String, Node> nodeTable; private List<Node> schema = new ArrayList<Node>(); private NamedNodeMap graphAttr; private Document origDocument; static Map<String, RegionScoreType> attributeMap = new LinkedHashMap(); /** * For each score type, we are only interested in the fraction of samples which are * modified in some way. We set thresholds here, in the form of [min max] */ private static Map<String, float[]> bounds = new HashMap<String, float[]>(8); private Set<Node> rejectedNodes = new HashSet<Node>(); private Set<Node> rejectedEdges = new HashSet<Node>(); private boolean filtersFinalized; static { attributeMap.put("PERCENT_MUTATED", RegionScoreType.MUTATION_COUNT); attributeMap.put("PERCENT_CNA_AMPLIFIED", RegionScoreType.AMPLIFICATION); attributeMap.put("PERCENT_CNA_HOMOZYGOUSLY_DELETED", RegionScoreType.DELETION); attributeMap.put("PERCENT_MRNA_WAY_UP", RegionScoreType.EXPRESSION); attributeMap.put("PERCENT_MRNA_WAY_DOWN", RegionScoreType.EXPRESSION); } /** * The thresholds determine whether a particular sample is altered in that fashion. * The number is calculated using track.getRegionScore * * Mutation will be the number of mutations in a given gene (integer) * * For the others, we take the numerical value that has been read from the data source, * and average it over the region of each gene. * */ static { float max_val = 2 << 10; float mut = Float .parseFloat(PreferenceManager.getInstance().get(PreferenceManager.CBIO_MUTATION_THRESHOLD)); bounds.put("PERCENT_MUTATED", new float[] { mut, max_val }); //See GISTIC supplement, page 20 float amp = Float .parseFloat(PreferenceManager.getInstance().get(PreferenceManager.CBIO_AMPLIFICATION_THRESHOLD)); bounds.put("PERCENT_CNA_AMPLIFIED", new float[] { amp, max_val }); float del = Float .parseFloat(PreferenceManager.getInstance().get(PreferenceManager.CBIO_DELETION_THRESHOLD)); bounds.put("PERCENT_CNA_HOMOZYGOUSLY_DELETED", new float[] { del, max_val }); //See GISTIC supplement, page 5, just gives greater than or less than 0 float expUp = Float .parseFloat(PreferenceManager.getInstance().get(PreferenceManager.CBIO_EXPRESSION_UP_THRESHOLD)); bounds.put("PERCENT_MRNA_WAY_UP", new float[] { expUp, max_val }); float expDown = Float .parseFloat(PreferenceManager.getInstance().get(PreferenceManager.CBIO_EXPRESSION_DOWN_THRESHOLD)); bounds.put("PERCENT_MRNA_WAY_DOWN", new float[] { -max_val, -expDown }); } public static final String PERCENT_ALTERED = "PERCENT_ALTERED"; private String sourcePath; /** * Generally for testing. The path from which data was loaded. * * @return */ String getSourcePath() { return sourcePath; } // protected KeyFactory edgeKeyFactory; // protected KeyFactory vertexKeyFactory; // protected List<KeyFactory> factoryList; public GeneNetwork() { this(Node.class); } public GeneNetwork(EdgeFactory edgeFactory) { super(edgeFactory); } public GeneNetwork(Class clazz) { super(clazz); // // edgeKeyFactory = new KeyFactory("edge"); // vertexKeyFactory = new KeyFactory("node"); // factoryList = Arrays.asList(edgeKeyFactory, vertexKeyFactory); } // public List<KeyFactory> getFactoryList() { // return this.factoryList; // } /** * Applies {@code predicate} to every element in {@code object}, and adds * any which return false to {@code rejectSet}. Intended for soft filtering. * There is an override here, where we never filter out a Node which * is marked as being part of the query * * @param predicate * @param objects * @param rejectSet * @return * @throws IllegalStateException If the filters have been finalized. */ private int filter(Predicate predicate, Iterable<Node> objects, Set rejectSet) { if (filtersFinalized) { throw new IllegalStateException("Cannot filter after filtering has been finalized"); } int filtered = 0; for (Node v : objects) { String in_query = getNodeAttrValue(v, KEY, "IN_QUERY"); if (Boolean.parseBoolean(in_query)) { continue; } if (!predicate.evaluate(v)) { filtered += rejectSet.add(v) ? 1 : 0; } } return filtered; } public int filterNodes(Predicate predicate) { return this.filter(predicate, this.vertexSet(), rejectedNodes); } public int filterNodesRange(final String key, final float min, final float max) { Predicate<Node> pred = new Predicate<Node>() { @Override public boolean evaluate(Node object) { String sval = getNodeKeyData(object, key); if (sval != null) { float fval = Float.parseFloat(sval); return fval >= min && fval <= max; } return false; } }; return this.filterNodes(pred); } public int filterEdges(Predicate predicate) { return this.filter(predicate, this.edgeSet(), rejectedEdges); } /** * Returns a set of nodes which have not been rejected by the filter * * @return */ public Set<Node> vertexSetFiltered() { Set<Node> filteredSet = new HashSet<Node>(this.vertexSet()); filteredSet.removeAll(rejectedNodes); return filteredSet; } /** * Return the set of edges connected to this node, whose neighbors * are not filtered out. If this node is filtered out, will * return the empty set. * * @param n * @return */ public Set<Node> edgesOfFiltered(Node n) { if (rejectedNodes.contains(n)) { return new HashSet<Node>(0); } Set<Node> filteredEdges = new HashSet<Node>(this.edgesOf(n).size()); for (Node edge : this.outgoingEdgesOf(n)) { if (!rejectedNodes.contains(this.getEdgeTarget(edge)) && !rejectedEdges.contains(edge)) { filteredEdges.add(edge); } } for (Node edge : this.incomingEdgesOf(n)) { if (!rejectedNodes.contains(this.getEdgeSource(edge)) && !rejectedEdges.contains(edge)) { filteredEdges.add(edge); } } return filteredEdges; } /** * Actually apply filters irrevocably, removing edges * and nodes from the graph * * @return True iff anything was removed due to filtering. * Filters only get finalized if this call has an effect. So if * this returns true, it means at least 1 node or edge has been removed, * and the filters are finalized. If it returns false, nothing was removed * and the graph is not finalized. */ boolean finalizeFilters() { if (filtersFinalized) { throw new IllegalStateException("Cannot finalize filters twice"); } boolean rejected = this.removeAllVertices(rejectedNodes); rejected |= this.removeAllEdges(rejectedEdges); this.clearAllFilters(); this.filtersFinalized = rejected; return rejected; } public void clearNodeFilters() { if (filtersFinalized) { throw new IllegalStateException("Cannot clear filters after they have been finalized"); } this.rejectedNodes.clear(); } public void clearEdgeFilters() { if (filtersFinalized) { throw new IllegalStateException("Cannot clear filters after they have been finalized"); } this.rejectedEdges.clear(); } public void clearAllFilters() { this.clearNodeFilters(); this.clearEdgeFilters(); } public boolean pruneGraph() { Predicate min_connections = new Predicate<Node>() { public boolean evaluate(Node object) { return edgesOf(object).size() >= 1; } }; return this.filterNodes(min_connections) > 0; } /** * Hash the url to get a file name, locate it in temp directory * * @param url * @return */ static File getCachedFile(String url) { String cachedFileName = Math.abs(url.hashCode()) + "_tmp.xml"; File cachedFile = new File(DirectoryManager.getCacheDirectory(), cachedFileName); return cachedFile; } static String getURLForGeneList(Iterable<String> geneList) { String query = HttpUtils.buildURLString(geneList, "+"); String url = BASE_URL + "?" + GENE_LIST + "=" + query + "&" + common_parms; return url; } public static GeneNetwork getFromCBIO(Iterable<String> geneList) throws IOException { String url = getURLForGeneList(geneList); File cachedFile = getCachedFile(url); if (cachedFile.exists()) { url = cachedFile.getAbsolutePath(); } GeneNetwork network = new GeneNetwork(); network.loadNetwork(url); return network; } /** * Load graphml data from provide path. * * @param path * @throws IOException If something went wrong loading data. * Note we wrap other exception types in this. */ public int loadNetwork(String path) throws IOException { String error = null; int numNodes = -1; try { InputStream cbioStream = ParsingUtils.openInputStreamGZ(new ResourceLocator(path)); this.sourcePath = path; Document document = Utilities.createDOMDocumentFromXmlStream(cbioStream); //Cache the file if (HttpUtils.isRemoteURL(path)) { File cacheFile = getCachedFile(path); try { this.exportDocument(document, cacheFile.getAbsolutePath()); } catch (IOException e) { //No biggy, we just don't cache the file log.error("Error caching file: " + e); cacheFile.delete(); } cacheFile.deleteOnExit(); } this.origDocument = document; //Read schema from top and save it addToSchema(document.getElementsByTagName("key")); graphAttr = document.getElementsByTagName("graph").item(0).getAttributes(); NodeList nodes = document.getElementsByTagName(NODE_TAG); //Generate the graph itself. First add the nodes, then the edges int docNodes = nodes.getLength(); nodeTable = new HashMap<String, Node>(docNodes); for (int nn = 0; nn < docNodes; nn++) { Node node = nodes.item(nn); String label = node.getAttributes().getNamedItem("id").getTextContent(); nodeTable.put(label, node); this.addVertex(node); } NodeList edges = document.getElementsByTagName(EDGE_TAG); int docEdges = edges.getLength(); for (int ee = 0; ee < docEdges; ee++) { Node edge = edges.item(ee); NamedNodeMap attrs = edge.getAttributes(); String source = attrs.getNamedItem("source").getTextContent(); String target = attrs.getNamedItem("target").getTextContent(); this.addEdge(nodeTable.get(source), nodeTable.get(target), edge); } numNodes = this.vertexSet().size(); } catch (ParserConfigurationException e) { throw new IOException(e.getMessage()); } catch (SAXException e) { throw new IOException(e.getMessage()); } return numNodes; } /** * Add schema information for the provided datakeys. * They will all be set as the provided dataType (string, double, float, etc.) * and graph element * * @param dataKeys * @param dataType Legal values are long, integer, float, double, boolean, string. Case insensitive. * All dataKeys will be set to the provided type. * @param typeFor Legal values are "all", "graph", "node", "edge" */ private void addSchema(Collection<String> dataKeys, String dataType, String typeFor) { Element key; for (String dataKey : dataKeys) { key = this.origDocument.createElement("key"); key.setAttribute("id", dataKey); //TODO id is supposed to unique, attr.name human readable. //Not quite sure of any reason they can't be the same. key.setAttribute("attr.name", dataKey); key.setAttribute("attr.type", dataType.toLowerCase()); if (typeFor != null) { key.setAttribute("for", typeFor); } schema.add(key); } } private void addToSchema(NodeList keys) { for (int kk = 0; kk < keys.getLength(); kk++) { schema.add(keys.item(kk)); } } /** * The the value of a child node by the key. * If there are multiple matches, the first is returned. * Search is not recursive. * <p/> * <p/> * Example: Say that node has the following XML * "<node id="3725"/> * <data key="label">JUN</data> * <data key="type">Protein</data> * <data key="RELATIONSHIP_XREF">HGNC:JUN;Entrez Gene:3725</data> * <data key="IN_QUERY">false</data> * </node>" * So getNodeKeyData(node, "key", "label") returns "JUN". * * @param node * @param attrName * @param attrValue * @return String value of key found. null if not found */ public static String getNodeAttrValue(Node node, String attrName, String attrValue) { NodeList elements = node.getChildNodes(); for (int ee = 0; ee < elements.getLength(); ee++) { Node el = elements.item(ee); try { NamedNodeMap map = el.getAttributes(); Node label = map.getNamedItem(attrName); String textContent = label.getTextContent(); if (textContent.compareToIgnoreCase(attrValue) == 0) { return el.getTextContent(); } } catch (NullPointerException e) { //In general these get hit due to newlines and such //We simply skip continue; } } return null; } /** * Gets the value of a child node with "key" attribute * equal to {@code key} parameter. * Equal to getNodeAttrValue(node, NetworkAnnotator.KEY, key); * * @param node Node to search * @param key Key to search for * @return String value of key found. null if not found * @see #getNodeAttrValue */ public static String getNodeKeyData(Node node, String key) { return getNodeAttrValue(node, KEY, key); } /** * Get the GraphML from this document. Filters will be finalized when this is called, * if they aren't already. * * @return */ public Document createDocument() { if (!filtersFinalized) { this.finalizeFilters(); } try { // Create a DOM document DocumentBuilder documentBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder(); Document document = documentBuilder.newDocument(); document.setStrictErrorChecking(false); // Global root element Element globalElement = document.createElement("graphml"); //Add schema and attributes for (Node node : schema) { globalElement.appendChild(node); } Element graphEl = document.createElement("graph"); for (int aa = 0; aa < graphAttr.getLength(); aa++) { Node attr = graphAttr.item(aa); graphEl.setAttribute(attr.getNodeName(), attr.getTextContent()); } //Add nodes and edges for (Node v : this.vertexSet()) { graphEl.appendChild(v); } for (Node e : this.edgeSet()) { graphEl.appendChild(e); } globalElement.appendChild(graphEl); document.appendChild(globalElement); return document; } catch (Exception e) { throw new RuntimeException("Error outputting graph"); } } public static int writeEncodedString(String string, OutputStream outputStream, boolean gzip, boolean base64encode) throws IOException { byte[] byteData; if (gzip) { ByteArrayOutputStreamChild gmlByteStream = new ByteArrayOutputStreamChild(string.length() / 20); GZIPOutputStream gzipOutputStream = new GZIPOutputStream(gmlByteStream); gzipOutputStream.write(string.getBytes()); gzipOutputStream.finish(); byteData = gmlByteStream.getBuf(); gmlByteStream.close(); } else { byteData = string.getBytes(); } int count = 0; if (base64encode) { char[] gmlData = Base64Coder.encode(byteData); for (char c : gmlData) { outputStream.write(c); count++; } } else { outputStream.write(byteData); outputStream.flush(); count += byteData.length; } outputStream.flush(); return count; } /** * Write document to XML at outputFile. File is deleted if there * is an error writing out. If the outputFile has a .gz extension, * the output is gzipped. * * @param document The Document to write out * @param outputPath * @return success * @throws java.io.IOException */ private int exportDocument(Document document, String outputPath) throws IOException { boolean gzip = outputPath.endsWith(".gz"); String xmlString = Utilities.getString(document); OutputStream stream = new FileOutputStream(outputPath); int count = writeEncodedString(xmlString, stream, gzip, false); stream.flush(); stream.close(); return count; } /** * Calls {@link #exportDocument} with the Document created from {@link #createDocument} * * @param outputFile * @return * @throws IOException */ int exportGraph(String outputFile) throws IOException { return exportDocument(this.createDocument(), outputFile); } public String outputForcBioView() throws IOException { String outPath = null; BufferedReader reader = null; OutputStream outputStream = null; try { File temp = File.createTempFile("cbio", ".html"); temp.deleteOnExit(); outPath = temp.getAbsolutePath(); InputStreamReader fReader = new InputStreamReader( GeneNetwork.class.getResourceAsStream("resources/post_stub.html")); reader = new BufferedReader(fReader); outputStream = new FileOutputStream(outPath); String line; while ((line = reader.readLine()) != null) { if (line.trim().equals("allthegraphmldatagoeshere")) { writeEncodedString(Utilities.getString(this.createDocument()), outputStream, true, true); } else { outputStream.write((line + FileUtils.LINE_SEPARATOR).getBytes()); outputStream.flush(); } } outputStream.flush(); } catch (IOException e) { log.error("Error writing cBio stub form to " + outPath); log.error(e.getMessage()); } finally { if (reader != null) { reader.close(); } if (outputStream != null) { outputStream.close(); } } return outPath; } /** * Add the data specified by the score-types to our * network, using data from the tracks. * <p/> * * @param tracks * @param nodeAttributes */ public void annotate(List<Track> tracks, Collection<String> nodeAttributes) { Set<Node> nodes = this.vertexSet(); String name; for (Node node : nodes) { name = getNodeKeyData(node, LABEL); ScoreData data = this.collectScoreData(name, tracks, nodeAttributes); //Don't add annotation if gene has no alteration? float relData = data.getPercentAltered(); if (relData == 0 && !Globals.isTesting()) { continue; } for (String attr : nodeAttributes) { Element newData = node.getOwnerDocument().createElement("data"); newData.setAttribute(KEY, attr); newData.setTextContent("" + data.get(attr)); node.appendChild(newData); } //Set total Element newData = node.getOwnerDocument().createElement("data"); newData.setAttribute(KEY, PERCENT_ALTERED); newData.setTextContent("" + data.getPercentAltered()); node.appendChild(newData); } addSchema(Arrays.asList(PERCENT_ALTERED), "float", "node"); addSchema(nodeAttributes, "float", "node"); } public ScoreData collectScoreData(String name, List<Track> tracks, Iterable<String> attributes) { int zoom = 0; List<NamedFeature> features = FeatureDB.getFeaturesList(name, Integer.MAX_VALUE); //If we are viewing a gene list, use the frame List<ReferenceFrame> frames = Globals.isHeadless() ? null : FrameManager.getFrames(); ReferenceFrame frame = Globals.isHeadless() ? null : FrameManager.getDefaultFrame(); if (frames != null) { for (ReferenceFrame frm : frames) { if (frm.getName().equalsIgnoreCase(name)) { frame = frm; } } } String frameName = frame != null ? frame.getName() : null; ScoreData<String, Float> results = new ScoreData(RegionScoreType.values().length); int initCapacity = tracks.size() / 10; //The names of all samples these tracks cover Set<String> allSamples = new HashSet<String>(initCapacity); //Each track/feature pair represents a region of a sample. //We store whether that sample has been altered in ANY way Set<String> anyAlteration = new HashSet<String>(initCapacity); //Set of samples which have data for this type Set<String> samplesForType = new HashSet<String>(initCapacity); //Set of samples which have been altered, using this type. Set<String> alteredSamplesForType = new HashSet<String>(initCapacity); for (String attr : attributes) { if (!bounds.containsKey(attr)) { throw new IllegalArgumentException("Have no bounds for " + attr); } RegionScoreType type = attributeMap.get(attr); float[] curBounds = bounds.get(attr); samplesForType.clear(); alteredSamplesForType.clear(); for (NamedFeature feat : features) { if (!name.equalsIgnoreCase(feat.getName())) { continue; } int featStart = feat.getStart(); int featEnd = feat.getEnd(); for (Track track : tracks) { if (!track.isVisible()) { continue; } String sample = track.getSample(); //If track is wrong type, or if sample has already been marked altered, //no further information can be gained if (!track.isRegionScoreType(type) || alteredSamplesForType.contains(sample)) { //if(alteredSamplesForType.contains(sample)) assert samplesForType.contains(sample); continue; } samplesForType.add(sample); float score = track.getRegionScore(feat.getChr(), featStart, featEnd, zoom, type, frameName, tracks); if (score >= curBounds[0] && score <= curBounds[1] && !Float.isNaN(score)) { alteredSamplesForType.add(sample); } } } allSamples.addAll(samplesForType); anyAlteration.addAll(alteredSamplesForType); float fractionAltered = ((float) alteredSamplesForType.size()) / samplesForType.size(); results.put(attr, fractionAltered); } results.setPercentAltered(((float) anyAlteration.size()) / allSamples.size()); return results; } public void annotateAll(List<Track> tracks) { this.annotate(tracks, attributeMap.keySet()); } public static class ScoreData<K, V> extends HashMap<K, V> { /** * Here we do not distinguish between any alteration value * or type. So 0,1,2,3 -> percentAltered = 3/4. * <p/> * Intended to represent the total fraction of samples with * ANY kind of alteration. So a sample that's mutated, amplified, * and upregulated would be counted once. */ private float percentAltered; public ScoreData(int size) { super(size); } public void setPercentAltered(float percentAltered) { this.percentAltered = percentAltered; } public float getPercentAltered() { return percentAltered; } } /** * Child class created so we can access the raw byte buffer, * and avoid making another copy. */ private static class ByteArrayOutputStreamChild extends ByteArrayOutputStream { public ByteArrayOutputStreamChild() { super(); } public ByteArrayOutputStreamChild(int size) { super(size); } public byte[] getBuf() { return this.buf; } } }