Our next goal is to create a newick string for the tree currently stored in a TreeManip
object. Add the makeNewick
function declaration in the public area of the TreeManip
class declaration, and add the makeNewick
function body just above the terminating curly bracket of the namespace (i.e. just before the end of the file):
#pragma once
#include <cassert>
#include <memory>
<span style="color:#0000ff"><strong>#include <stack></strong></span>
<span style="color:#0000ff"><strong>#include <boost/format.hpp></strong></span>
#include "tree.hpp"
namespace strom {
class TreeManip {
public:
TreeManip();
TreeManip(Tree::SharedPtr t);
~TreeManip();
void setTree(Tree::SharedPtr t);
Tree::SharedPtr getTree();
double calcTreeLength() const;
unsigned countEdges() const;
void scaleAllEdgeLengths(double scaler);
void createTestTree();
<span style="color:#0000ff"><strong>std::string makeNewick(unsigned precision, bool use_names = false) const;</strong></span>
void clear();
private:
Tree::SharedPtr _tree;
public:
typedef std::shared_ptr< TreeManip > SharedPtr;
};
//...
<span style="color:#0000ff"><strong>inline std::string TreeManip::makeNewick(unsigned precision, bool use_names) const {</strong></span>
<span style="color:#0000ff"><strong>std::string newick;</strong></span>
<span style="color:#0000ff"><strong>const boost::format tip_node_name_format( boost::str(boost::format("%%s:%%.%df") % precision) );</strong></span>
<span style="color:#0000ff"><strong>const boost::format tip_node_number_format( boost::str(boost::format("%%d:%%.%df") % precision) );</strong></span>
<span style="color:#0000ff"><strong>const boost::format internal_node_format( boost::str(boost::format("):%%.%df") % precision) );</strong></span>
<span style="color:#0000ff"><strong>std::stack<Node *> node_stack;</strong></span>
<span style="color:#0000ff"><strong></strong></span>
<span style="color:#0000ff"><strong>Node * root_tip = (_tree->_is_rooted ? 0 : _tree->_root);</strong></span>
<span style="color:#0000ff"><strong>for (auto nd : _tree->_preorder) {</strong></span>
<span style="color:#0000ff"><strong>if (nd->_left_child) {</strong></span>
<span style="color:#0000ff"><strong>newick += "(";</strong></span>
<span style="color:#0000ff"><strong>node_stack.push(nd);</strong></span>
<span style="color:#0000ff"><strong>if (root_tip) {</strong></span>
<span style="color:#0000ff"><strong>if (use_names) {</strong></span>
<span style="color:#0000ff"><strong>newick += boost::str(boost::format(tip_node_name_format)</strong></span>
<span style="color:#0000ff"><strong>% root_tip->_name</strong></span>
<span style="color:#0000ff"><strong>% nd->_edge_length);</strong></span>
<span style="color:#0000ff"><strong>} else {</strong></span>
<span style="color:#0000ff"><strong>newick += boost::str(boost::format(tip_node_number_format)</strong></span>
<span style="color:#0000ff"><strong>% (root_tip->_number + 1)</strong></span>
<span style="color:#0000ff"><strong>% nd->_edge_length);</strong></span>
<span style="color:#0000ff"><strong>}</strong></span>
<span style="color:#0000ff"><strong>newick += ",";</strong></span>
<span style="color:#0000ff"><strong>root_tip = 0;</strong></span>
<span style="color:#0000ff"><strong>}</strong></span>
<span style="color:#0000ff"><strong>}</strong></span>
<span style="color:#0000ff"><strong>else {</strong></span>
<span style="color:#0000ff"><strong>if (use_names) {</strong></span>
<span style="color:#0000ff"><strong>newick += boost::str(boost::format(tip_node_name_format)</strong></span>
<span style="color:#0000ff"><strong>% nd->_name</strong></span>
<span style="color:#0000ff"><strong>% nd->_edge_length);</strong></span>
<span style="color:#0000ff"><strong>} else {</strong></span>
<span style="color:#0000ff"><strong>newick += boost::str(boost::format(tip_node_number_format)</strong></span>
<span style="color:#0000ff"><strong>% (nd->_number + 1)</strong></span>
<span style="color:#0000ff"><strong>% nd->_edge_length);</strong></span>
<span style="color:#0000ff"><strong>}</strong></span>
<span style="color:#0000ff"><strong>if (nd->_right_sib)</strong></span>
<span style="color:#0000ff"><strong>newick += ",";</strong></span>
<span style="color:#0000ff"><strong>else {</strong></span>
<span style="color:#0000ff"><strong>Node * popped = (node_stack.empty() ? 0 : node_stack.top());</strong></span>
<span style="color:#0000ff"><strong>while (popped && !popped->_right_sib) {</strong></span>
<span style="color:#0000ff"><strong>node_stack.pop();</strong></span>
<span style="color:#0000ff"><strong>if (node_stack.empty()) {</strong></span>
<span style="color:#0000ff"><strong>newick += ")";</strong></span>
<span style="color:#0000ff"><strong>popped = 0;</strong></span>
<span style="color:#0000ff"><strong>}</strong></span>
<span style="color:#0000ff"><strong>else {</strong></span>
<span style="color:#0000ff"><strong>newick += boost::str(boost::format(internal_node_format) % popped->_edge_length);</strong></span>
<span style="color:#0000ff"><strong>popped = node_stack.top();</strong></span>
<span style="color:#0000ff"><strong>}</strong></span>
<span style="color:#0000ff"><strong>}</strong></span>
<span style="color:#0000ff"><strong>if (popped && popped->_right_sib) {</strong></span>
<span style="color:#0000ff"><strong>node_stack.pop();</strong></span>
<span style="color:#0000ff"><strong>newick += boost::str(boost::format(internal_node_format) % popped->_edge_length);</strong></span>
<span style="color:#0000ff"><strong>newick += ",";</strong></span>
<span style="color:#0000ff"><strong>}</strong></span>
<span style="color:#0000ff"><strong>}</strong></span>
<span style="color:#0000ff"><strong>}</strong></span>
<span style="color:#0000ff"><strong>}</strong></span>
<span style="color:#0000ff"><strong></strong></span>
<span style="color:#0000ff"><strong>return newick;</strong></span>
<span style="color:#0000ff"><strong>}</strong></span>
}
Note the convention that //...
indicates that possibly many lines have been omitted from the code snippet.
In the code shown above, you may have noticed that I’ve also indicated in blue, bold text two new headers that have been included near the top of the file. The <stack>
header allows us to use the standard stack
and the <boost/format.hpp>
header allows us to use the Boost format
function , which simplifies producing formatted strings.
makeNewick
function
The goal of this function is to turn a tree in memory into a newick
string representation using nested parentheses to indicate clades and subclades. Each left parenthesis encountered in the newick description signifies the start of a new clade (we are moving to the left child of the current node), each comma means we’re moving laterally to a sibling node, and a right parenthesis means we have reached the upper right node in a clade and are dropping back down to the parent of that node. The figure at right shows a newick description for 4-taxon rooted tree. The numbers at the nodes are the index of the node in the _preorder
vector. Note that the root node is not included in _preorder
, so there is no number shown in the figure for the root node.
The main loop in this function visits all nodes in the tree in preorder sequence:
for (auto nd : _tree->_preorder) {
//...
}
The auto nd : _tree->_preorder
code sets nd
to each element of _tree->_preorder
, in turn. The C++11 keyword auto
makes our life easy by determining the type of nd
automatically.
Internal nodes, when visited, result in the addition of a left parenthesis to the growing newick string. We will not add information about the internal node until we are finished with the clade defined by the node and have added the matching right parenthesis. Internal nodes are stored in the node_stack
. A stack is a container in which items can only be removed in the reverse order in which they were added to the stack, like a stack of documents (the last one added to the top is also the first one removed). It is necessary to store each internal node in the stack so that we can determine how many right parentheses to add to the newick string.
if (nd->_left_child) {
newick += "(";
node_stack.push(nd);
When a tip node is visited, the tip_node_name_format
or tip_node_number_format
string is used (depending on the argument passed into the parameter use_names
) to add the node name (if use_names
is true
) or a number (if use_names
is false
). The name or number is followed by a colon and then the edge length for that tip node.
const boost::format tip_node_name_format( boost::str(boost::format("%%s:%%.%df") % precision) );
const boost::format tip_node_number_format( boost::str(boost::format("%%d:%%.%df") % precision) );
The Boost library’s format class is used to format the string. Here we are creating two objects of type boost::format
(the boost
part is the Boost namespace, while format
is the class). The variables’ names are tip_node_name_format
and tip_node_number_format
. This works much like Python or sprintf in C: placeholders (single percent symbols followed by a letter such as d, f, or s) are substituted for the supplied integer, floating point value, or string, respectively. Each doubled percent (%%
) ends up being a single literal %
in the string after replacements, so the above statement will be equal to the following after replacement of the %d
by the supplied precision (assume that precision = 5 for this example):
const boost::format tip_node_format("%d:%.5f");
You might wonder why the boost::str(...)
is necessary. The boost::format
constructor takes a string as its sole argument, not a boost::format
object, and the Boost format library does not provide an implicit conversion of format objects to string objects, so the boost::str
function provides an explicit way to do this conversion. This is done intentionally in order to make it easier to report errors accurately when compiling formats. You could also call boost::format
’s str
function to accomplish the conversion:
const boost::format tip_node_number_format( (boost::format("%%d:%%.%df") % precision).str() );
The Boost format documentation provides some examples of using boost::format
.
If the tip node has a sibling to its right, a comma is output to the growing newick string. If the tip node does not have a right sibling, then things get interesting! Take a look at the figure above of a rooted tree and its associated newick description. Note that after the tip node C is output, two right parentheses (each followed by edge lengths) are output before the next node in the preorder sequence (tip node D) is handled. Why is this? It is because we must close up the definitions of two clades before moving on to tip node D. The first clade is the “cherry” comprising tip nodes B and C along with the internal node labeled 3. The second clade that must be finished comprises nodes labeled 1, 2, 3, 4 and 5. How do we know which edge lengths to spit out? The answer lies in our node_stack
. The last two nodes added to node_stack
are the internal nodes labeled 1 and 3 in the figure. When we reach the far corner of a clade (we know we are at the far corner when a node has no children and no siblings), we must pop nodes off the node_stack
(outputting a right parenthesis and edge length for each) until we reach an internal node with a right sibling: this sibling is necessarily the next node in the preorder sequence (in this case, tip node D, labeled 6 in the figure). That is what is happening in the large else
clause paired with if (nd->_right_sib)
in the code for the makeNewick
function. This else
clause is further complicated by the need to check whether we are done, which happens when we pop the last node off the node_stack
.
The stack function top()
returns the next object on the stack without actually removing it. The function pop()
deletes the object that is at the top of the stack. Note that we must always call top()
to save the node at the top of the stack before we call pop()
to delete it: it would not do to lose track of a node by pop()
ing it with out first saving a pointer to the node pop()
ed!
Finally, note that we add a comma to the newick string if the last internal node popped off the stack has a right sibling. The comma always indicates a sibling relationship for a node whether the node is a tip node or an internal node.
The figure of a 5-taxon unrooted tree on the right is subtly different from the figure above of a 4-taxon rooted tree. The root node in this case represents a tip (A), and this particular tip along with its edge length (which is really the _edge_length
belonging to its only child (the node labeled 0 in the figure) is saved to the newick string as if it were the leftmost child of the first node in the _preorder
vector. The part of the makeNewick
function that accomplishes this is contained in the if (root_tip) {...}
block of code. The variable root_tip
is 0 if the tree is rooted, but points to the root node if the tree is unrooted, so the code inside the if (root_node) {...}
conditional is only executed if root_tip
actually points to the root node. This code adds the number of the root node and a comma immediately after the opening left parenthesis. It also sets root_tip to 0 so that this block of code is not executed again.
While it is not essential to store unrooted trees by “rooting” them at a tip node, that is the convention that is followed throughout this tutorial. This points out a basic ambiguity inherent in newick tree descriptions. If you saw the newick string
(A:0.13,(B:0.2,(C:0.1,D:0.1):0.11):0.15,E:0.3)
how would you know this represents an unrooted binary tree and not a rooted tree with a polytomy (multifurcation) at the base? The answer is you could not know this unless the user who supplied you with the newick string told you.