Java tutorial
/* * Copyright 2013 University of Glasgow. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package broadwick.phylo; import broadwick.graph.Edge; import broadwick.graph.Tree; import broadwick.io.FileInput; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang3.StringUtils; /** * Parser for a Newick tree. This has limited functionality and only works for Newick trees <ul> <li> with unique node * names (e.g. one node can be unnamed but the rest must be uniquely named)</li> <li> with defined branch lengths (e.g. * (A:0.1,B:0.2,(C:0.3,D:0.4)E:0.5)F:0.0;)</li> </ul> */ @Slf4j public class NewickTreeParser { /** * Create the parser. * @param newickFile the name of the file containing the tree. */ public NewickTreeParser(final String newickFile) { try (FileInput file = new FileInput(newickFile)) { newickString = file.read(); } catch (IOException ex) { log.error("Could not read Newick file {}. {}", newickFile, ex.getLocalizedMessage()); } } /** * Create the parser. */ public NewickTreeParser() { newickString = null; } /** * Parse a given string that MUST be a valid Newick format. * @param newickStr the string to parse. * @return a Tree created from the parsed string. */ public final Tree<PhyloNode, Edge<PhyloNode>> parse(final String newickStr) { branchLabel = 0; final Tree<PhyloNode, Edge<PhyloNode>> phyloTree = new Tree<>(); // first remove the trailing ; if any String stringToParse = StringUtils.removeEnd(newickStr.trim(), ";").trim(); if (stringToParse.charAt(0) == '(' && getClosingParenthesis(stringToParse) == stringToParse.length() - 1) { stringToParse = stringToParse.substring(1, stringToParse.length() - 1); } // extract the information for the root node, this will be after the last closing parenthesis iff there is // a colon (TODO is this right? e.g. (A,B,(C,D),E)F; ) double distance = 0.0; String rootName = ""; final int rparen = stringToParse.lastIndexOf(')'); final int rcolon = stringToParse.lastIndexOf(':'); if (rparen < rcolon) { distance = Double.parseDouble(stringToParse.substring(rcolon + 1)); rootName = stringToParse.substring(rparen + 1, rcolon); stringToParse = stringToParse.substring(0, rcolon - rootName.length()).trim(); // adjust the string to exclude the distance } // create a new node for this root and add it to the tree. final PhyloNode root = new PhyloNode(rootName, distance); log.trace("Parsing Newick file: Adding root node {}", root); phyloTree.addVertex(root); // if we don't have a comma we are just have the root node if (stringToParse.indexOf(',') == -1) { return phyloTree; } if (stringToParse.charAt(0) == '(' && getClosingParenthesis(stringToParse) == stringToParse.length() - 1) { stringToParse = stringToParse.substring(1, stringToParse.length() - 1); } parseString(stringToParse, root, phyloTree); return phyloTree; } /** * Parse a string from a Newick file. * @return a generated Tree object. */ public final Tree<PhyloNode, Edge<PhyloNode>> parse() { if (newickString == null) { throw new IllegalArgumentException( "NewickParser created but no string to parse found. Did you create the parser without specifying a file?"); } return parse(newickString); } /** * Parse a string in Newick format (where the root node has been removed) recursively. * @param stringToParse the string to be parsed. * @param parent the parent node for the string (initially this is the root node but will be a brach node if * this method is called recursively). * @param tree the tree to which the created nodes will be attached. */ private void parseString(final String stringToParse, final PhyloNode parent, final Tree<PhyloNode, Edge<PhyloNode>> tree) { for (String node : findNodes(stringToParse)) { parseNode(node, tree, parent); } } /** * Parse a tree splitting it into top level nodes and subtrees. * @param tree the tree tp be parsed. * @return a collection of nodes and subtrees contained in the parsed string */ private Collection<String> findNodes(final String tree) { final Collection<String> nodes = new ArrayList<>(); int start = 0; int depth = 0; boolean isOk = true; for (int i = 0; i < tree.length(); i++) { final char charAt = tree.charAt(i); if (charAt == '(') { depth++; isOk = false; } if (charAt == ')' && --depth == 0) { isOk = true; } if (charAt == ',' && isOk) { nodes.add(tree.substring(start, i).trim()); start = i + 1; } } nodes.add(tree.substring(start).trim()); return nodes; } /** * Get the index of the closing parenthesis in a string that starts with an opening parenthesis. If the string does * not start with an opening parenthesis then an IllegalArgumentException is thrown. The algorithm for finding the * closing parenthesis is simple - keep reading the string incrementing a count of the number of opening parentheses * we find and decrementing the count if we encounter a closing parenthesis. When we read a closing parenthesis AND * the count becomes zero we have our closing parenthesis. * @param strng the string to examine. * @return the index in the string of the closing parenthesis, -1 if none is found. */ private int getClosingParenthesis(final String strng) { if (!strng.trim().startsWith("(")) { throw new IllegalArgumentException( String.format("Illegal Argument [%s] does not start with an opening parenthesis", strng)); } int depth = 0; for (int i = 0; i < strng.length(); i++) { if (strng.charAt(i) == '(') { depth++; } if (strng.charAt(i) == ')' && (--depth == 0)) { return i; } } return -1; } /** * Parse a string containing information on a node in Newick format and attach it to a given tree. * @param node the string containing the node information. * @param tree the tree to which the created node will be attached. * @param parent the node on the tree to which the new node will be created. */ private void parseNode(final String node, final Tree<PhyloNode, Edge<PhyloNode>> tree, final PhyloNode parent) { log.trace("parsing {} from parent node {} ", node, parent.getId()); if (node.charAt(0) == '(') { // this is a branch so create a branch node and set the parent final int rparen = node.lastIndexOf(')'); final PhyloNode branchNode = addNodeToTree(node.substring(rparen + 1), tree, parent, true); for (String nd : findNodes(node.substring(1, rparen))) { parseNode(nd, tree, branchNode); } } else if (node.indexOf(',') != -1) { for (String nd : findNodes(node)) { parseNode(nd, tree, parent); } } else { // this is a single node addNodeToTree(node, tree, parent, false); } } /** * Given a string that represents a node in the tree, split in into its constituent name and distance components and * add it to the tree. If the string is empty then a dummy node is added, this is useful fro creating (unnamed) * branches. * @param node the string representing the node. * @param tree the tree object on which the node will be placed. * @param parent the parent node to which this node is attached. * @param createUniqueName if the name of the node is not unique then add create a unique one if this is true. * @return the node added to the tree. */ private PhyloNode addNodeToTree(final String node, final Tree<PhyloNode, Edge<PhyloNode>> tree, final PhyloNode parent, final boolean createUniqueName) { log.trace("Adding {} to tree at {}.", node, parent); PhyloNode phyloNode; final int lcolon = node.indexOf(':'); String nodeName; double distance = 0.0; if (node.isEmpty()) { // no details so use default values. nodeName = String.format("branch-%d", branchLabel++); distance = 0.0; } else if (lcolon == -1) { // we just have a node name and no distance. distance = 0.0; nodeName = node; } else { // we have a node name and a distance. distance = Double.parseDouble(node.substring(lcolon + 1)); nodeName = node.substring(0, lcolon); } phyloNode = new PhyloNode(nodeName, distance); try { tree.addEdge(new Edge<>(parent, phyloNode, distance), parent, phyloNode); } catch (IllegalArgumentException e) { // we may have non-unique branch names in the data file, if we encounter one here and we want to rename // it we do it now. This is not very good as we are searching for an error message but it's the best as can // be done. if (e.getLocalizedMessage().contains("already exists in this graph") && createUniqueName) { nodeName = String.format("branch-%d", branchLabel++); phyloNode = new PhyloNode(nodeName, distance); tree.addEdge(new Edge<>(parent, phyloNode), parent, phyloNode); } } return phyloNode; } private String newickString; private int branchLabel = 0; }