barry: Your go-to motif accountant  0.0-1
Full enumeration of sample space and fast count of sufficient statistics for binary arrays
geese-meat-likelihood.hpp
Go to the documentation of this file.
1 #ifndef GEESE_MEAT_LIKELIHOOD_HPP
2 #define GEESE_MEAT_LIKELIHOOD_HPP 1
3 
4 #include "geese-bones.hpp"
5 
6 inline void pset_loop(
7  size_t n,
8  size_t s,
9  size_t nfunctions,
10  const size_t node_id,
11  const size_t array_id,
12  std::vector< double > & totprob_n,
13  const std::vector< double > & par0,
14  const std::vector<std::vector<bool>> & states,
15  const std::vector< PhyloArray > & psets,
16  const std::vector< std::vector< size_t > > & locations,
17  const std::vector<geese::Node *> & node_offspring,
18  const double * psetprobs
19 )
20 {
21  // Retrieving the pset
22  const auto & x = psets[n];
23 
24  if (!x.is_dense())
25  throw std::logic_error("This is only supported for dense arrays.");
26 
27  const size_t * location_x = &locations[n][0u];
28 
29  // Extracting the possible values of each offspring
30  double off_mult = 1.0;
31 
32  for (auto o = 0u; o < x.ncol(); ++o)
33  {
34 
35  // Setting the node
36  const Node * n_off = node_offspring[o];
37 
38  // In the case that the offspring is a leaf, then we need to
39  // check whether the state makes sense.
40  if (n_off->is_leaf())
41  {
42  for (auto f = 0u; f < nfunctions; ++f)
43  {
44  if (n_off->annotations[f] != 9u)
45  {
46 
47  if (x(f, o) != n_off->annotations[f])
48  {
49 
50  off_mult = -1.0;
51  break;
52 
53  }
54 
55  }
56 
57  }
58 
59  // Going out
60  if (off_mult < 0)
61  break;
62 
63  continue;
64 
65  }
66 
67  // Retrieving the location to the respective set of probabilities
68  off_mult *= n_off->subtree_prob[*(location_x + o)];
69 
70  }
71 
72  // Is this state valid?
73  if (off_mult < 0.0)
74  return;
75 
76  // Use try catch in the following line
77  try {
78 
79  off_mult *= *(psetprobs + n);
80 
81  } catch (std::exception & e) {
82 
83  auto err = std::string(e.what());
84 
85  std::string state_str = "";
86  for (const auto & ss : states[s])
87  state_str += std::to_string(ss) + " ";
88 
89  err = "Error computing the likelihood at node " +
90  std::to_string(node_id) + " with state " + state_str +
91  ". Error message:\n" +
92  err;
93 
94  throw std::runtime_error(err);
95 
96  }
97 
98  // Adding to the total probabilities
99  totprob_n[n] = off_mult;
100 
101 }
102 
103 inline double Geese::likelihood(
104  const std::vector< double > & par,
105  bool as_log,
106  bool use_reduced_sequence,
107  size_t ncores,
108  bool no_update_pset_probs
109 ) {
110 
111  // Checking whether the model is initialized
112  INITIALIZED()
113 
114  // Splitting the probabilities
115  std::vector< double > par0(par.begin(), par.end() - nfunctions);
116  std::vector< double > par_root(par.end() - nfunctions, par.end());
117 
118  // Scaling root
119  for (auto& p : par_root)
120  p = std::exp(p)/(std::exp(p) + 1);
121 
122  double ll = 0.0;
123 
124  // Updating normalizing constants
125  if (!no_update_pset_probs)
126  model->update_pset_probs(par0, ncores);
127 
128  // Following the prunning sequence
129  const std::vector< size_t > & preseq = use_reduced_sequence ?
130  this->reduced_sequence : this->sequence;
131 
132  // The first time it is called, it need to generate the corresponding
133  // hashes of the columns so it is fast to access then (saves time
134  // hashing and looking in the map.)
135  const auto & arrays2support = *(model->get_arrays2support());
136  const auto & psetprobs = *(model->get_pset_probs());
137  const auto & pset_locations = *(model->get_pset_locations());
138 
139  for (auto& i : preseq)
140  {
141 
142  // We cannot compute probability at the leaf, we need to continue
143  if (this->nodes[i].is_leaf())
144  continue;
145 
146  // Since we are using this a lot...
147  Node & node = nodes[i];
148  const size_t node_id = node.id;
149 
150  // Iterating through states
151  for (size_t s = 0u; s < states.size(); ++s)
152  {
153 
154  // Starting the prob
155  size_t array_id = node.narray[s];
156  size_t support_id = arrays2support[array_id];
157  const double * psetsprobs_s = &psetprobs[pset_locations[support_id]];
158 
159  // Retrieving the sets of arrays
160  const std::vector< PhyloArray > & psets =
161  *(model->get_pset(array_id));
162 
163  std::vector< std::vector< size_t > > & locations = pset_loc[support_id];
164 
165  // Making sure parallelization makes sense
166  if (psets.size() < 128)
167  ncores = 1u;
168 
169  // Summation over all possible values of X
170  const auto & node_offspring = node.offspring;
171  std::vector< double > totprob_n(psets.size(), 0.0);
172  for (size_t n = 0u; n < psets.size(); ++n)
173  {
174  pset_loop(
175  n, s, nfunctions, node_id, array_id, totprob_n,
176  par0, states, psets, locations,
177  node_offspring, psetsprobs_s
178  );
179  }
180 
181  // Setting the probability at the node
182  node.subtree_prob[s] = 0.0;
183  auto & nsp = node.subtree_prob[s];
184  #if defined(_OPENMP) || defined(__OPENMP)
185  #pragma omp simd reduction(+:nsp)
186  #endif
187  for (size_t n = 0u; n < psets.size(); ++n)
188  nsp += totprob_n[n];
189 
190 
191  }
192 
193  // All probabilities should be completed at this point
194  if (node.parent == nullptr)
195  {
196 
197  for (size_t s = 0u; s < states.size(); ++s)
198  {
199 
200  double tmpll = 1.0;
201 
202  for (auto k = 0u; k < nfunctions; ++k)
203  {
204 
205  tmpll *= states[s][k] ? par_root[k] : (1 - par_root[k]);
206 
207  }
208 
209  ll += tmpll * node.subtree_prob[s];
210 
211  }
212  }
213 
214  }
215 
216  // In the case that the sequence is empty, then it means
217  // that we are looking at a completely unnanotated tree,
218  // thus the likelihood should be one
219  if (preseq.size() == 0u)
220  return as_log ? -std::numeric_limits<double>::infinity() : 1.0;
221 
222 
223  return as_log ? std::log(ll) : ll;
224 
225 }
226 #endif
std::vector< size_t > reduced_sequence
std::vector< size_t > sequence
std::vector< std::vector< std::vector< size_t > > > pset_loc
Locations of columns.
size_t nfunctions
double likelihood(const std::vector< double > &par, bool as_log=false, bool use_reduced_sequence=true, size_t ncores=1u, bool no_update_pset_probs=false)
std::map< size_t, Node > nodes
A single node for the model.
std::vector< size_t > annotations
Observed annotations (only defined for Geese)
size_t id
Id of the node (as specified in the input)
std::vector< Node * > offspring
Offspring nodes.
std::vector< size_t > narray
ID of the array in the model.
Node * parent
Parent node.
bool is_leaf() const noexcept
std::vector< double > subtree_prob
Induced subtree probabilities.
size_t i
#define INITIALIZED()
Definition: geese-bones.hpp:22
void pset_loop(size_t n, size_t s, size_t nfunctions, const size_t node_id, const size_t array_id, std::vector< double > &totprob_n, const std::vector< double > &par0, const std::vector< std::vector< bool >> &states, const std::vector< PhyloArray > &psets, const std::vector< std::vector< size_t > > &locations, const std::vector< geese::Node * > &node_offspring, const double *psetprobs)