19.9 Modifying Strom

(Linux version)

< 19.8 | 19.9 | 19.10 >

Modifying Strom

Add data members _allow_polytomies, _topo_prior_C, _resolution_class_prior to the Strom class as shown below to allow the user to (1) choose whether to allow polytomies, (2) set the C parameter of the topology prior, and (3) choose whether the topology prior represents a resolution class or standard polytomy prior.

#pragma once    

#include &lt;iostream&gt;
#include "data.hpp"
#include "likelihood.hpp"
#include "tree_summary.hpp"
#include "partition.hpp"
#include "lot.hpp"
#include "chain.hpp"
#include "output_manager.hpp"
#include &lt;boost/program_options.hpp&gt;
#include &lt;boost/filesystem.hpp&gt;
#include &lt;boost/algorithm/string/split.hpp&gt;
#include &lt;boost/algorithm/string/classification.hpp&gt;

namespace strom {

    class Strom {
        public:
                                                    Strom();
                                                    ~Strom();

            void                                    clear();
            void                                    processCommandLineOptions(int argc, const char * argv[]);
            void                                    run();
                    
        private:
            bool                                    processAssignmentString(Model::SharedPtr m, const std::string & which, const std::string & definition);
            void                                    handleAssignmentStrings(Model::SharedPtr m, const boost::program_options::variables_map & vm, std::string label, const std::vector&lt;std::string&gt; & definitions, std::string default_definition); 
            bool                                    splitAssignmentString(const std::string & definition, std::vector&lt;std::string&gt; & vector_of_subset_names, std::vector&lt;double&gt;  & vector_of_values);
            void                                    sample(unsigned iter, Chain & chain);

            void                                    readData();
            void                                    readTrees();
            void                                    showPartitionInfo();
            void                                    showBeagleInfo();
            void                                    showMCMCInfo();
            void                                    calcHeatingPowers();
            void                                    initChains();
            void                                    startTuningChains();
            void                                    stopTuningChains();
            void                                    stepChains(unsigned iteration, bool sampling);
            void                                    swapChains();
            void                                    stopChains();
            void                                    swapSummary() const;
            void                                    showChainTuningInfo() const;

            double                                  _expected_log_likelihood;
            
            <span style="color:#0000ff"><strong>double                                  _topo_prior_C;</strong></span>
            <span style="color:#0000ff"><strong>bool                                    _allow_polytomies;</strong></span>
            <span style="color:#0000ff"><strong>bool                                    _resolution_class_prior;</strong></span>

            std::string                             _data_file_name;
            std::string                             _tree_file_name;
            Partition::SharedPtr                    _partition;

            Data::SharedPtr                         _data;
            std::vector&lt;Likelihood::SharedPtr&gt;      _likelihoods;
            TreeSummary::SharedPtr                  _tree_summary;
            Lot::SharedPtr                          _lot;

            unsigned                                _random_seed;
            unsigned                                _num_iter;
            unsigned                                _print_freq;
            unsigned                                _sample_freq;

            unsigned                                _num_burnin_iter; 
            bool                                    _using_stored_data;
            bool                                    _use_gpu;
            bool                                    _ambig_missing;
            unsigned                                _num_chains;
            double                                  _heating_lambda;
            std::vector&lt;Chain&gt;                      _chains;
            std::vector&lt;double&gt;                     _heating_powers;
            std::vector&lt;unsigned&gt;                   _swaps;

            bool                                    _use_underflow_scaling;

            static std::string                      _program_name;
            static unsigned                         _major_version;
            static unsigned                         _minor_version;
            
            OutputManager::SharedPtr                _output_manager;

    };
    

Initialize the 3 new data members in the Strom::clear function

    inline void Strom::clear() {    
        _data_file_name             = "";
        _tree_file_name             = "";
        _tree_summary               = nullptr;
        _partition.reset(new Partition());
        _use_gpu                    = true;
        _ambig_missing              = true;
        _expected_log_likelihood    = 0.0;
        _data                       = nullptr;
        _use_underflow_scaling      = false;
        _lot                        = nullptr;
        _random_seed                = 1;
        _num_iter                   = 1000;
        _print_freq                 = 1;
        _sample_freq                = 1;
        _output_manager             = nullptr;
        
        <span style="color:#0000ff"><strong>_topo_prior_C               = 1.0;</strong></span>
        <span style="color:#0000ff"><strong>_allow_polytomies           = true;</strong></span>
        <span style="color:#0000ff"><strong>_resolution_class_prior     = true;</strong></span>

        _using_stored_data          = true;
        _likelihoods.clear();
        _num_burnin_iter            = 1000;
        _heating_lambda             = 0.5;
        _num_chains                 = 1;
        _chains.resize(0);
        _heating_powers.resize(0);
        _swaps.resize(0);
    }   

Provide settings for the new data members to allow the user to modify them in processCommandLineOptions.

    inline void Strom::processCommandLineOptions(int argc, const char * argv[]) {   
        std::vector&lt;std::string&gt; partition_statefreq;
        std::vector&lt;std::string&gt; partition_rmatrix;
        std::vector&lt;std::string&gt; partition_omega;
        std::vector&lt;std::string&gt; partition_ratevar;
        std::vector&lt;std::string&gt; partition_pinvar;
        std::vector&lt;std::string&gt; partition_ncateg;
        std::vector&lt;std::string&gt; partition_subsets;
        std::vector&lt;std::string&gt; partition_relrates;
        std::vector&lt;std::string&gt; partition_tree;
        boost::program_options::variables_map vm;
        boost::program_options::options_description desc("Allowed options");
        desc.add_options()
            ("help,h", "produce help message")
            ("version,v", "show program version")
            ("seed,z",        boost::program_options::value(&_random_seed)-&gt;default_value(1),   "pseudorandom number seed")
            ("niter,n",       boost::program_options::value(&_num_iter)-&gt;default_value(1000),   "number of MCMC iterations")
            ("printfreq",  boost::program_options::value(&_print_freq)-&gt;default_value(1),   "skip this many iterations before reporting progress")
            ("samplefreq",  boost::program_options::value(&_sample_freq)-&gt;default_value(1),   "skip this many iterations before sampling next")
            ("datafile,d",  boost::program_options::value(&_data_file_name)-&gt;required(), "name of a data file in NEXUS format")
            ("treefile,t",  boost::program_options::value(&_tree_file_name)-&gt;required(), "name of a tree file in NEXUS format")
            ("subset",  boost::program_options::value(&partition_subsets), "a string defining a partition subset, e.g. 'first:1-1234\3' or 'default[codon:standard]:1-3702'")
            ("ncateg,c", boost::program_options::value(&partition_ncateg), "number of categories in the discrete Gamma rate heterogeneity model")
            ("statefreq", boost::program_options::value(&partition_statefreq), "a string defining state frequencies for one or more data subsets, e.g. 'first,second:0.1,0.2,0.3,0.4'")
            ("omega", boost::program_options::value(&partition_omega), "a string defining the nonsynonymous/synonymous rate ratio omega for one or more data subsets, e.g. 'first,second:0.1'")
            ("rmatrix", boost::program_options::value(&partition_rmatrix), "a string defining the rmatrix for one or more data subsets, e.g. 'first,second:1,2,1,1,2,1'")
            ("ratevar", boost::program_options::value(&partition_ratevar), "a string defining the among-site rate variance for one or more data subsets, e.g. 'first,second:2.5'")
            ("pinvar", boost::program_options::value(&partition_pinvar), "a string defining the proportion of invariable sites for one or more data subsets, e.g. 'first,second:0.2'")
            ("relrate", boost::program_options::value(&partition_relrates), "a string defining the (unnormalized) relative rates for all data subsets (e.g. 'default:3,1,6').")
            ("tree", boost::program_options::value(&partition_tree), "the index of the tree in the tree file (first tree has index = 1)")
            <span style="color:#0000ff"><strong>("topopriorC", boost::program_options::value(&_topo_prior_C)-&gt;default_value(1.0), "topology prior C: tree (or resolution class) with m internal nodes has probability C time greater than tree (or resolution class) with m+1 internal nodes.")</strong></span>
            <span style="color:#0000ff"><strong>("allowpolytomies", boost::program_options::value(&_allow_polytomies)-&gt;default_value(true), "yes or no; if yes, then topopriorC and polytomyprior are used, otherwise topopriorC and polytomyprior are ignored")</strong></span>
            <span style="color:#0000ff"><strong>("resclassprior", boost::program_options::value(&_resolution_class_prior)-&gt;default_value(true), "if yes, topologypriorC will apply to resolution classes; if no, topologypriorC will apply to individual tree topologies")</strong></span>
            ("expectedLnL", boost::program_options::value(&_expected_log_likelihood)-&gt;default_value(0.0), "log likelihood expected")
            ("nchains",       boost::program_options::value(&_num_chains)-&gt;default_value(1),                "number of chains")
            ("heatfactor",    boost::program_options::value(&_heating_lambda)-&gt;default_value(0.5),          "determines how hot the heated chains are")
            ("burnin",        boost::program_options::value(&_num_burnin_iter)-&gt;default_value(100),         "number of iterations used to burn in chains")
            ("usedata",       boost::program_options::value(&_using_stored_data)-&gt;default_value(true),      "use the stored data in calculating likelihoods (specify no to explore the prior)")
            ("gpu",           boost::program_options::value(&_use_gpu)-&gt;default_value(true),                "use GPU if available")
            ("ambigmissing",  boost::program_options::value(&_ambig_missing)-&gt;default_value(true),          "treat all ambiguities as missing data")
            ("underflowscaling",  boost::program_options::value(&_use_underflow_scaling)-&gt;default_value(true),          "scale site-likelihoods to prevent underflow (slower but safer)")
        ;   

Modify the sample function to output the resolution class of sampled trees in the parameters file.

    inline void Strom::sample(unsigned iteration, Chain & chain) {  
        if (chain.getHeatingPower() &lt; 1.0)
            return;
            
        bool time_to_sample = (bool)(iteration % _sample_freq == 0);
        bool time_to_report = (bool)(iteration % _print_freq == 0);
        if (time_to_sample || time_to_report) {
            double logLike = chain.getLogLikelihood();
            double logPrior = chain.calcLogJointPrior();
            double TL = chain.getTreeManip()-&gt;calcTreeLength();
            <span style="color:#0000ff"><strong>unsigned m = chain.getTreeManip()-&gt;calcResolutionClass();</strong></span>
            if (time_to_report) {
                if (logPrior == Updater::getLogZero())
                    _output_manager-&gt;outputConsole(boost::str(boost::format("%12d %12d %12.5f %12s %12.5f") % iteration % m % logLike % "-infinity" % TL));
                else
                    _output_manager-&gt;outputConsole(boost::str(boost::format("%12d %12d %12.5f %12.5f %12.5f") % iteration % m % logLike % logPrior % TL));
            }
            if (time_to_sample) {
                _output_manager-&gt;outputTree(iteration, chain.getTreeManip());
                <span style="color:#0000ff"><strong>_output_manager-&gt;outputParameters(iteration, logLike, logPrior, TL, m, chain.getModel());</strong></span>
            }
        }
    }   

Modify the initChains function to notify chains of the new settings via their Model.

    inline void Strom::initChains() {   
        // Create _num_chains chains
        _chains.resize(_num_chains);
        
        // Create _num_chains by _num_chains swap matrix
        _swaps.assign(_num_chains*_num_chains, 0);

        // Create heating power vector
        _heating_powers.assign(_num_chains, 1.0);
        calcHeatingPowers();
        
        // Initialize chains
        for (unsigned chain_index = 0; chain_index &lt; _num_chains; ++chain_index) {
            auto & c        = _chains[chain_index];
            auto likelihood = _likelihoods[chain_index];
            auto m          = likelihood-&gt;getModel();
            
            // Finish setting up models
            <span style="color:#0000ff"><strong>m-&gt;setTopologyPriorOptions(_allow_polytomies, _resolution_class_prior, _topo_prior_C);</strong></span>
            m-&gt;setSubsetNumPatterns(_data-&gt;calcNumPatternsVect());
            m-&gt;setSubsetSizes(_partition-&gt;calcSubsetSizes());
            m-&gt;activate();
            if (chain_index == 0)
                std::cout &lt;&lt; "\n" &lt;&lt; m-&gt;describeModel() &lt;&lt; std::endl;
            else
                m-&gt;describeModel();
                
            // Finish setting up likelihoods
            likelihood-&gt;setData(_data);
            likelihood-&gt;useUnderflowScaling(_use_underflow_scaling);
            likelihood-&gt;initBeagleLib();
            likelihood-&gt;useStoredData(_using_stored_data);
            
            // Build list of updaters, one for each free parameter in the model
            unsigned num_free_parameters = c.createUpdaters(m, _lot, likelihood);   
            if (num_free_parameters == 0)
                throw XStrom("MCMC skipped because there are no free parameters in the model");

            // Tell the chain that it should adapt its updators (at least initially)
            c.startTuning();

            // Set heating power to precalculated value
            c.setChainIndex(chain_index);
            c.setHeatingPower(_heating_powers[chain_index]);
                        
            // Give the chain a starting tree
            std::string newick = _tree_summary-&gt;getNewick(m-&gt;getTreeIndex());
            c.setTreeFromNewick(newick);

            // Print headers in output files and make sure each updator has its starting value
            c.start();
        }
    }   

< 19.8 | 19.9 | 19.10 >