The TreeSummary
class will read a tree file, keep track of the number of distinct tree topologies found therein, and count the number of trees having each unique topology.
This ability is essential for summarizing a posterior sample of trees. It allows one to construct the 95% credible set of tree topologies, for example, and sets the stage for constructing a majority rule consensus tree.
Create a new class named TreeSummary
and store it in a header file named tree_summary.hpp.
Here is the TreeSummary
class declaration. The class has 2 data members. The vector _newicks
stores the tree descriptions of all trees read from a tree file. The vector _treeIDs
stores a map that associates a key in the form of a tree ID (the set of all splits in a tree) with a value in the form of a vector of integers, where each integer in the vector is the index of a tree description stored in _newicks
. Each separate tree ID corresponds to a unique topology, and the associated vector allows us to count and, if desired, print out all trees having that topology.
#pragma once
#include <set>
#include <map>
#include <vector>
#include <fstream>
#include <cassert>
#include <algorithm>
#include <boost/format.hpp>
#include <boost/range/adaptor/reversed.hpp>
#include "split.hpp"
#include "tree_manip.hpp"
#include "xstrom.hpp"
#include "ncl/nxsmultiformat.h"
namespace strom {
class TreeSummary {
public:
TreeSummary();
~TreeSummary();
void readTreefile(const std::string filename, unsigned skip);
void showSummary() const;
typename Tree::SharedPtr getTree(unsigned index);
std::string getNewick(unsigned index);
void clear();
private:
Split::treemap_t _treeIDs;
std::vector<std::string> _newicks;
public:
typedef std::shared_ptr< TreeSummary > SharedPtr;
};
// insert member function bodies here
}
The constructor and destructor function do nothing except report that a TreeSummary
object was created or destroyed, respectively.
inline TreeSummary::TreeSummary() {
std::cout << "Constructing a TreeSummary" << std::endl;
}
inline TreeSummary::~TreeSummary() {
std::cout << "Destroying a TreeSummary" << std::endl;
}
This function returns a shared pointer to a tree built from the tree description stored in _newicks[index]
. If index is too large, an XStrom
exception is thrown.
inline Tree::SharedPtr TreeSummary::getTree(unsigned index) {
if (index >= _newicks.size())
throw XStrom("getTree called with index >= number of stored trees");
TreeManip tm;
// build the tree
tm.buildFromNewick(_newicks[index], false, false);
return tm.getTree();
}
inline std::string TreeSummary::getNewick(unsigned index) {
if (index >= _newicks.size())
throw XStrom("getNewick called with index >= number of stored trees");
return _newicks[index];
}
The clear function simply empties the _treeIDs
and _newicks
vectors.
inline void TreeSummary::clear() {
_treeIDs.clear();
_newicks.clear();
}
This function reads the tree file specified by filename
, skipping the first skip trees. We make use of the NCL’s MultiFormatReader
class to find all TAXA blocks. If the NEXUS-formatted tree file does not contain a TAXA block, the NCL will create one for the taxa that it finds in the TREES block. For each TAXA block found, the member function clear()
is called to reset the TreeSummary
object. (Typically, there will be only one TAXA block in each tree file, but if there happens to be more than one, only the last will be stored.) The readTreefile
function then stores each newick tree description found in _newicks
, and, for each, builds the tree using the TreeManip::buildFromNewick
function and uses TreeManip::storeSplits
to create a tree ID (called splitset
) for the tree. If the tree ID for this tree has never been seen, it creates a new entry in the _treeIDs
map for this tree ID and associates it with a vector of length 1 containing the index of this tree. If the tree ID has already been seen, it simply adds the current tree index to the vector already associated with that tree ID.
inline void TreeSummary::readTreefile(const std::string filename, unsigned skip) {
TreeManip tm;
Split::treeid_t splitset;
// See http://phylo.bio.ku.edu/ncldocs/v2.1/funcdocs/index.html for NCL documentation
MultiFormatReader nexusReader(-1, NxsReader::WARNINGS_TO_STDERR);
try {
nexusReader.ReadFilepath(filename.c_str(), MultiFormatReader::NEXUS_FORMAT);
}
catch(...) {
nexusReader.DeleteBlocksFromFactories();
throw;
}
int numTaxaBlocks = nexusReader.GetNumTaxaBlocks();
for (int i = 0; i < numTaxaBlocks; ++i) {
clear();
NxsTaxaBlock * taxaBlock = nexusReader.GetTaxaBlock(i);
std::string taxaBlockTitle = taxaBlock->GetTitle();
const unsigned nTreesBlocks = nexusReader.GetNumTreesBlocks(taxaBlock);
for (unsigned j = 0; j < nTreesBlocks; ++j) {
const NxsTreesBlock * treesBlock = nexusReader.GetTreesBlock(taxaBlock, j);
unsigned ntrees = treesBlock->GetNumTrees();
if (skip < ntrees) {
//std::cout << "Trees block contains " << ntrees << " tree descriptions.\n";
for (unsigned t = skip; t < ntrees; ++t) {
const NxsFullTreeDescription & d = treesBlock->GetFullTreeDescription(t);
// store the newick tree description
std::string newick = d.GetNewick();
_newicks.push_back(newick);
unsigned tree_index = (unsigned)_newicks.size() - 1;
// build the tree
tm.buildFromNewick(newick, false, false);
// store set of splits
splitset.clear();
tm.storeSplits(splitset);
// iterator iter will point to the value corresponding to key splitset
// or to end (if splitset is not already a key in the map)
Split::treemap_t::iterator iter = _treeIDs.lower_bound(splitset);
if (iter == _treeIDs.end() || iter->first != splitset) {
// splitset key not found in map, need to create an entry
std::vector<unsigned> v(1, tree_index); // vector of length 1 with only element set to tree_index
_treeIDs.insert(iter, Split::treemap_t::value_type(splitset, v));
}
else {
// splitset key was found in map, need to add this tree's index to vector
iter->second.push_back(tree_index);
}
} // trees loop
} // if skip < ntrees
} // TREES block loop
} // TAXA block loop
// No longer any need to store raw data from nexus file
nexusReader.DeleteBlocksFromFactories();
}
Finally, add the body of the showSummary
function, which reports the information stored in _treeIDs
in a couple of different ways. It first outputs the number of trees it read from the file. Next, it iterates through all distinct tree topologies stored in _treeIDs
, reporting the index of each tree having that topology. It finishes by producing a table showing the number of trees found for each distinct tree topology, sorted from most frequent topology to the least frequent topology.
inline void TreeSummary::showSummary() const {
// Produce some output to show that it works
std::cout << boost::str(boost::format("\nRead %d trees from file") % _newicks.size()) << std::endl;
// Show all unique topologies with a list of the trees that have that topology
// Also create a map that can be used to sort topologies by their sample frequency
typedef std::pair<unsigned, unsigned> sorted_pair_t;
std::vector< sorted_pair_t > sorted;
int t = 0;
for (auto & key_value_pair : _treeIDs) {
unsigned topology = ++t;
unsigned ntrees = (unsigned)key_value_pair.second.size();
sorted.push_back(std::pair<unsigned, unsigned>(ntrees,topology));
std::cout << "Topology " << topology << " seen in these " << ntrees << " trees:" << std::endl << " ";
std::copy(key_value_pair.second.begin(), key_value_pair.second.end(), std::ostream_iterator<unsigned>(std::cout, " "));
std::cout << std::endl;
}
// Show sorted histogram data
std::sort(sorted.begin(), sorted.end());
//unsigned npairs = (unsigned)sorted.size();
std::cout << "\nTopologies sorted by sample frequency:" << std::endl;
std::cout << boost::str(boost::format("%20s %20s") % "topology" % "frequency") << std::endl;
for (auto & ntrees_topol_pair : boost::adaptors::reverse(sorted)) {
unsigned n = ntrees_topol_pair.first;
unsigned t = ntrees_topol_pair.second;
std::cout << boost::str(boost::format("%20d %20d") % t % n) << std::endl;
}
}