Add data members _allow_polytomies
, _topo_prior_C
, _resolution_class_prior
to the Strom
class as shown below to allow the user to (1) choose whether to allow polytomies, (2) set the C parameter of the topology prior, and (3) choose whether the topology prior represents a resolution class or standard polytomy prior.
#pragma once
#include <iostream>
#include "data.hpp"
#include "likelihood.hpp"
#include "tree_summary.hpp"
#include "partition.hpp"
#include "lot.hpp"
#include "chain.hpp"
#include "output_manager.hpp"
#include <boost/program_options.hpp>
#include <boost/filesystem.hpp>
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string/classification.hpp>
namespace strom {
class Strom {
public:
Strom();
~Strom();
void clear();
void processCommandLineOptions(int argc, const char * argv[]);
void run();
private:
bool processAssignmentString(Model::SharedPtr m, const std::string & which, const std::string & definition);
void handleAssignmentStrings(Model::SharedPtr m, const boost::program_options::variables_map & vm, std::string label, const std::vector<std::string> & definitions, std::string default_definition);
bool splitAssignmentString(const std::string & definition, std::vector<std::string> & vector_of_subset_names, std::vector<double> & vector_of_values);
void sample(unsigned iter, Chain & chain);
void readData();
void readTrees();
void showPartitionInfo();
void showBeagleInfo();
void showMCMCInfo();
void calcHeatingPowers();
void initChains();
void startTuningChains();
void stopTuningChains();
void stepChains(unsigned iteration, bool sampling);
void swapChains();
void stopChains();
void swapSummary() const;
void showChainTuningInfo() const;
double _expected_log_likelihood;
<span style="color:#0000ff"><strong>double _topo_prior_C;</strong></span>
<span style="color:#0000ff"><strong>bool _allow_polytomies;</strong></span>
<span style="color:#0000ff"><strong>bool _resolution_class_prior;</strong></span>
std::string _data_file_name;
std::string _tree_file_name;
Partition::SharedPtr _partition;
Data::SharedPtr _data;
std::vector<Likelihood::SharedPtr> _likelihoods;
TreeSummary::SharedPtr _tree_summary;
Lot::SharedPtr _lot;
unsigned _random_seed;
unsigned _num_iter;
unsigned _print_freq;
unsigned _sample_freq;
unsigned _num_burnin_iter;
bool _using_stored_data;
bool _use_gpu;
bool _ambig_missing;
unsigned _num_chains;
double _heating_lambda;
std::vector<Chain> _chains;
std::vector<double> _heating_powers;
std::vector<unsigned> _swaps;
bool _use_underflow_scaling;
static std::string _program_name;
static unsigned _major_version;
static unsigned _minor_version;
OutputManager::SharedPtr _output_manager;
};
Initialize the 3 new data members in the Strom::clear
function
inline void Strom::clear() {
_data_file_name = "";
_tree_file_name = "";
_tree_summary = nullptr;
_partition.reset(new Partition());
_use_gpu = true;
_ambig_missing = true;
_expected_log_likelihood = 0.0;
_data = nullptr;
_use_underflow_scaling = false;
_lot = nullptr;
_random_seed = 1;
_num_iter = 1000;
_print_freq = 1;
_sample_freq = 1;
_output_manager = nullptr;
<span style="color:#0000ff"><strong>_topo_prior_C = 1.0;</strong></span>
<span style="color:#0000ff"><strong>_allow_polytomies = true;</strong></span>
<span style="color:#0000ff"><strong>_resolution_class_prior = true;</strong></span>
_using_stored_data = true;
_likelihoods.clear();
_num_burnin_iter = 1000;
_heating_lambda = 0.5;
_num_chains = 1;
_chains.resize(0);
_heating_powers.resize(0);
_swaps.resize(0);
}
Provide settings for the new data members to allow the user to modify them in processCommandLineOptions
.
inline void Strom::processCommandLineOptions(int argc, const char * argv[]) {
std::vector<std::string> partition_statefreq;
std::vector<std::string> partition_rmatrix;
std::vector<std::string> partition_omega;
std::vector<std::string> partition_ratevar;
std::vector<std::string> partition_pinvar;
std::vector<std::string> partition_ncateg;
std::vector<std::string> partition_subsets;
std::vector<std::string> partition_relrates;
std::vector<std::string> partition_tree;
boost::program_options::variables_map vm;
boost::program_options::options_description desc("Allowed options");
desc.add_options()
("help,h", "produce help message")
("version,v", "show program version")
("seed,z", boost::program_options::value(&_random_seed)->default_value(1), "pseudorandom number seed")
("niter,n", boost::program_options::value(&_num_iter)->default_value(1000), "number of MCMC iterations")
("printfreq", boost::program_options::value(&_print_freq)->default_value(1), "skip this many iterations before reporting progress")
("samplefreq", boost::program_options::value(&_sample_freq)->default_value(1), "skip this many iterations before sampling next")
("datafile,d", boost::program_options::value(&_data_file_name)->required(), "name of a data file in NEXUS format")
("treefile,t", boost::program_options::value(&_tree_file_name)->required(), "name of a tree file in NEXUS format")
("subset", boost::program_options::value(&partition_subsets), "a string defining a partition subset, e.g. 'first:1-1234\3' or 'default[codon:standard]:1-3702'")
("ncateg,c", boost::program_options::value(&partition_ncateg), "number of categories in the discrete Gamma rate heterogeneity model")
("statefreq", boost::program_options::value(&partition_statefreq), "a string defining state frequencies for one or more data subsets, e.g. 'first,second:0.1,0.2,0.3,0.4'")
("omega", boost::program_options::value(&partition_omega), "a string defining the nonsynonymous/synonymous rate ratio omega for one or more data subsets, e.g. 'first,second:0.1'")
("rmatrix", boost::program_options::value(&partition_rmatrix), "a string defining the rmatrix for one or more data subsets, e.g. 'first,second:1,2,1,1,2,1'")
("ratevar", boost::program_options::value(&partition_ratevar), "a string defining the among-site rate variance for one or more data subsets, e.g. 'first,second:2.5'")
("pinvar", boost::program_options::value(&partition_pinvar), "a string defining the proportion of invariable sites for one or more data subsets, e.g. 'first,second:0.2'")
("relrate", boost::program_options::value(&partition_relrates), "a string defining the (unnormalized) relative rates for all data subsets (e.g. 'default:3,1,6').")
("tree", boost::program_options::value(&partition_tree), "the index of the tree in the tree file (first tree has index = 1)")
<span style="color:#0000ff"><strong>("topopriorC", boost::program_options::value(&_topo_prior_C)->default_value(1.0), "topology prior C: tree (or resolution class) with m internal nodes has probability C time greater than tree (or resolution class) with m+1 internal nodes.")</strong></span>
<span style="color:#0000ff"><strong>("allowpolytomies", boost::program_options::value(&_allow_polytomies)->default_value(true), "yes or no; if yes, then topopriorC and polytomyprior are used, otherwise topopriorC and polytomyprior are ignored")</strong></span>
<span style="color:#0000ff"><strong>("resclassprior", boost::program_options::value(&_resolution_class_prior)->default_value(true), "if yes, topologypriorC will apply to resolution classes; if no, topologypriorC will apply to individual tree topologies")</strong></span>
("expectedLnL", boost::program_options::value(&_expected_log_likelihood)->default_value(0.0), "log likelihood expected")
("nchains", boost::program_options::value(&_num_chains)->default_value(1), "number of chains")
("heatfactor", boost::program_options::value(&_heating_lambda)->default_value(0.5), "determines how hot the heated chains are")
("burnin", boost::program_options::value(&_num_burnin_iter)->default_value(100), "number of iterations used to burn in chains")
("usedata", boost::program_options::value(&_using_stored_data)->default_value(true), "use the stored data in calculating likelihoods (specify no to explore the prior)")
("gpu", boost::program_options::value(&_use_gpu)->default_value(true), "use GPU if available")
("ambigmissing", boost::program_options::value(&_ambig_missing)->default_value(true), "treat all ambiguities as missing data")
("underflowscaling", boost::program_options::value(&_use_underflow_scaling)->default_value(true), "scale site-likelihoods to prevent underflow (slower but safer)")
;
Modify the sample
function to output the resolution class of sampled trees in the parameters file.
inline void Strom::sample(unsigned iteration, Chain & chain) {
if (chain.getHeatingPower() < 1.0)
return;
bool time_to_sample = (bool)(iteration % _sample_freq == 0);
bool time_to_report = (bool)(iteration % _print_freq == 0);
if (time_to_sample || time_to_report) {
double logLike = chain.getLogLikelihood();
double logPrior = chain.calcLogJointPrior();
double TL = chain.getTreeManip()->calcTreeLength();
<span style="color:#0000ff"><strong>unsigned m = chain.getTreeManip()->calcResolutionClass();</strong></span>
if (time_to_report) {
if (logPrior == Updater::getLogZero())
_output_manager->outputConsole(boost::str(boost::format("%12d %12d %12.5f %12s %12.5f") % iteration % m % logLike % "-infinity" % TL));
else
_output_manager->outputConsole(boost::str(boost::format("%12d %12d %12.5f %12.5f %12.5f") % iteration % m % logLike % logPrior % TL));
}
if (time_to_sample) {
_output_manager->outputTree(iteration, chain.getTreeManip());
<span style="color:#0000ff"><strong>_output_manager->outputParameters(iteration, logLike, logPrior, TL, m, chain.getModel());</strong></span>
}
}
}
Modify the initChains
function to notify chains of the new settings via their Model
.
inline void Strom::initChains() {
// Create _num_chains chains
_chains.resize(_num_chains);
// Create _num_chains by _num_chains swap matrix
_swaps.assign(_num_chains*_num_chains, 0);
// Create heating power vector
_heating_powers.assign(_num_chains, 1.0);
calcHeatingPowers();
// Initialize chains
for (unsigned chain_index = 0; chain_index < _num_chains; ++chain_index) {
auto & c = _chains[chain_index];
auto likelihood = _likelihoods[chain_index];
auto m = likelihood->getModel();
// Finish setting up models
<span style="color:#0000ff"><strong>m->setTopologyPriorOptions(_allow_polytomies, _resolution_class_prior, _topo_prior_C);</strong></span>
m->setSubsetNumPatterns(_data->calcNumPatternsVect());
m->setSubsetSizes(_partition->calcSubsetSizes());
m->activate();
if (chain_index == 0)
std::cout << "\n" << m->describeModel() << std::endl;
else
m->describeModel();
// Finish setting up likelihoods
likelihood->setData(_data);
likelihood->useUnderflowScaling(_use_underflow_scaling);
likelihood->initBeagleLib();
likelihood->useStoredData(_using_stored_data);
// Build list of updaters, one for each free parameter in the model
unsigned num_free_parameters = c.createUpdaters(m, _lot, likelihood);
if (num_free_parameters == 0)
throw XStrom("MCMC skipped because there are no free parameters in the model");
// Tell the chain that it should adapt its updators (at least initially)
c.startTuning();
// Set heating power to precalculated value
c.setChainIndex(chain_index);
c.setHeatingPower(_heating_powers[chain_index]);
// Give the chain a starting tree
std::string newick = _tree_summary->getNewick(m->getTreeIndex());
c.setTreeFromNewick(newick);
// Print headers in output files and make sure each updator has its starting value
c.start();
}
}