barry: Your go-to motif accountant, version 0.0-1
Full enumeration of sample space and fast count of sufficient statistics for binary arrays
geese-meat-predict.hpp
Go to the documentation of this file.
1 // #include "geese-bones.hpp"
2 
3 #ifndef GEESE_MEAT_PREDICT_HPP
4 #define GEESE_MEAT_PREDICT_HPP 1
5 
/**
 * @brief Compute, for every node, the posterior probability that each
 * function is present (res[node.ord][f] = P(x_{node,f} = 1 | Data)).
 *
 * Works root-to-tips: first conditions the root on the data via the model
 * likelihood, then walks `preorder` pushing probabilities down to offspring
 * by marginalizing over all parent states and all offspring configurations
 * (powersets) consistent with the observed annotations.
 *
 * @param par Model parameters: the first (size - nfuns()) entries are the
 *        term coefficients, the last nfuns() entries are root-state logits.
 * @param use_reduced_sequence Forwarded to likelihood().
 * @param preorder Node ids ordered root-first (callers build it by reversing
 *        the postorder `sequence`; see predict()).
 * @return nnodes() x nfuns() matrix of marginal probabilities, indexed by
 *         node.ord.
 * @throws std::runtime_error if the likelihood or a root-state probability
 *         is not finite; std::logic_error if a marginal probability falls
 *         outside [0,1] beyond numerical tolerance.
 */
inline std::vector< std::vector<double> > Geese::predict_backend(
    const std::vector< double > & par,
    bool use_reduced_sequence,
    const std::vector< size_t > & preorder
)
{

    // Splitting the parameter vector: term coefficients vs. root logits.
    // NOTE(review): par_terms is currently unused (it only fed the
    // commented-out model->likelihood() call below) — kept for reference.
    std::vector< double > par_terms(par.begin(), par.end() - nfuns());
    std::vector< double > par_root(par.end() - nfuns(), par.end());

    // Mapping the root logits to probabilities via the logistic function.
    for (auto& p : par_root)
        p = std::exp(p)/(std::exp(p) + 1);

    // Prior probability of each root state: product over functions of
    // par_root[f] (function present) or its complement (absent).
    std::vector< double > rootp(this->states.size(), 1.0);
    for (size_t s = 0u; s < rootp.size(); ++s)
    {

        for (size_t f = 0u; f < nfuns(); ++f)
            rootp[s] *= states[s][f] ? par_root[f] : (1.0 - par_root[f]);

    }

    // Result matrix, indexed by node creation order (node.ord).
    std::vector< std::vector<double> > res(
        nnodes(), std::vector<double>(nfuns())
    );

    // Step 1: Posterior state distribution at the root node,
    // P(x_s | D) = subtree_prob[s] * prior[s] / likelihood.
    std::vector< double > tmp_prob(nfuns(), 0.0);
    size_t root_id = preorder[0u];
    Node * tmp_node = &nodes[root_id];
    tmp_node->probability.resize(states.size(), 0.0);
    // no_update_pset_probs = true: the caller (predict()) has already
    // refreshed the pset probabilities for these parameters.
    double tmp_likelihood = likelihood(par, false, use_reduced_sequence, 1u, true);

    if (!std::isfinite(tmp_likelihood))
    {
        throw std::runtime_error("Likelihood is not finite");
    }

    for (size_t s = 0u; s < states.size(); ++s)
    {

        // Overall state probability P(x_s | D)
        tmp_node->probability[s] = tmp_node->subtree_prob[s] * rootp[s] /
            tmp_likelihood;

        if (!std::isfinite(tmp_node->probability[s]))
        {
            throw std::runtime_error("Probability is not finite");
        }

        // Marginalizing to the function level, P(x_sf = 1 | D): sum the
        // state posteriors over states in which function f is present.
        for (size_t f = 0u; f < nfuns(); ++f)
        {

            // The expected value is for observing x = 1, so only states
            // where the function is "on" contribute.
            if (states[s][f])
                tmp_prob[f] += tmp_node->probability[s];

        }

    }

    // Storing the final root probabilities.
    res[nodes[root_id].ord] = tmp_prob;

    // Retrieving the powerset probabilities cached in the model.
    const auto & pset_probs = *(model->get_pset_probs());
    const auto & arrays2support = *(model->get_arrays2support());
    const auto & pset_locations = *(model->get_pset_locations());

    // Downward pass: starts at the root and pushes probabilities to the
    // offspring of each internal node in preorder.
    for (auto & i : preorder)
    {

        // Leaves have no offspring, nothing to push down.
        Node & parent = nodes[i];
        if (parent.is_leaf())
            continue;

        // Per-parent-state workspaces:
        //  - everything_below[s][p]: product of offspring subtree probs for
        //    the p-th feasible offspring configuration given parent state s.
        //  - everything_above[s][p]: probability flowing from above for that
        //    same configuration.
        std::vector< std::vector< double > > everything_below;
        std::vector< std::vector< double > > everything_above;

        everything_below.reserve(states.size());
        everything_above.reserve(states.size());

        // All combinations of the parent's offspring states.
        // psets[s]    = configurations of offspring given parent state s.
        // psets[s][p] = the p-th such configuration (a PhyloArray).
        std::vector< std::vector< PhyloArray > > psets;
        psets.reserve(states.size());

        // Resetting offspring posteriors before accumulation. The resize is
        // a no-op after the first visit; std::fill guarantees zeros either way.
        for (auto & off : parent.offspring)
        {
            off->probability.resize(states.size(), 0.0);
            std::fill(off->probability.begin(), off->probability.end(), 0.0);
        }

        // Iterating through the parent states
        for (size_t s = 0u; s < states.size(); ++s)
        {
            std::vector< double > below;
            std::vector< double > above;
            std::vector< PhyloArray > pset;

            // Array id in the support, and which support set it belongs to.
            size_t array_id = parent.narray[s];
            size_t support_id = arrays2support[array_id];

            // Retrieving the powerset of offspring configurations for this
            // parent state.
            const auto & pset_arrays = model->get_pset(array_id);

            // Going over all possible offspring configurations given the
            // parent is in state s.
            for (size_t p = 0u; p < pset_arrays->size(); ++p)
            {

                // Candidate offspring configuration (functions x offspring).
                const PhyloArray & array_p = pset_arrays->at(p);

                // Whether this configuration is compatible with the observed
                // leaf annotations; incompatible ones get probability 0 and
                // are skipped entirely.
                bool in_the_set = true;

                // "Everything below": probability of the data below each
                // offspring given the offspring state in this configuration.
                double everything_below_p = 1.0;
                for (size_t off = 0u; off < parent.offspring.size(); ++off)
                {

                    // For leaf offspring the subtree probability is 1, but
                    // the configuration is only admissible if it matches the
                    // leaf's annotation data.
                    if (parent.offspring[off]->is_leaf())
                    {

                        // 9u encodes a missing annotation (see predict()'s
                        // leave-one-out blanking) and matches anything; any
                        // observed annotation must agree with array_p.
                        const auto & off_ann = parent.offspring[off]->annotations;
                        for (size_t f = 0u; f < nfuns(); ++f)
                        {

                            if ((off_ann[f] != 9u) && (off_ann[f] != array_p(f, off)))
                            {
                                in_the_set = false;
                                break;
                            }

                        }

                        if (!in_the_set)
                            break;

                        continue;

                    } else {

                        // Internal offspring: map its column (state) in this
                        // configuration to a state id and pick up its
                        // subtree probability.
                        // NOTE(review): find() result is dereferenced without
                        // checking against end() — assumes every column vector
                        // is a registered state; confirm upstream invariant.
                        const auto & off_state = array_p.get_col_vec(off);
                        size_t loc = this->map_to_state_id.find(off_state)->second;

                        everything_below_p *= parent.offspring[off]->subtree_prob[loc];

                    }

                }

                // Incompatible with the data: skip to the next configuration.
                if (!in_the_set)
                    continue;

                pset.push_back(array_p); // Generating a copy

                // - Conditions on the focal (parent) node being in state s.
                // - The offspring probabilities are the central quantity here:
                //   these saved values feed P(x_offspring | Data).
                below.push_back(everything_below_p);

                // Probability from above: P(config | parent state) scaled by
                // the parent's posterior over its subtree probability.
                above.push_back(
                    pset_probs[pset_locations[support_id] + p]
                    // model->likelihood(
                    //     par_terms, target_p, parent.narray[s], false
                    // )
                    * parent.probability[s] / parent.subtree_prob[s]
                );


            } // end for psets

            // Storing this parent state's feasible configurations and weights.
            psets.push_back(std::move(pset));
            everything_below.push_back(std::move(below));
            everything_above.push_back(std::move(above));


        } // end for states

        // Marginalizing at the state level for each offspring: the joint
        // weight of a configuration is above * below.
        for (size_t s = 0u; s < states.size(); ++s)
        {

            for (size_t p = 0u; p < everything_above[s].size(); ++p)
            {

                // p-th feasible configuration under parent state s.
                const auto & pset_p = psets[s][p];

                // Joint probability of this configuration (product of the
                // above- and below-tree contributions).
                everything_above[s][p] *= everything_below[s][p];

                for (size_t off = 0u; off < parent.offspring.size(); ++off)
                {

                    // State of this offspring within the configuration.
                    // NOTE(review): operator[] would insert on a missing key;
                    // assumed present (same invariant as the find() above).
                    auto cvec = pset_p.get_col_vec(off);
                    size_t off_s = this->map_to_state_id[cvec];
                    parent.offspring[off]->probability[off_s] += everything_above[s][p];

                    // Integrating over the offspring itself: accumulate into
                    // the function-level marginals.
                    for (size_t f = 0u; f < nfuns(); ++f)
                    {
                        if (cvec[f] == 1u)
                            res[parent.offspring[off]->ord][f] += everything_above[s][p];
                    }



                }

            }
        }

        // Finally, sanity-check and clamp the function-level marginals.
        for (const auto & off : parent.offspring)
        {

            // Probabilities must add up correctly; anything outside [0,1]
            // beyond a small numerical tolerance is a logic error.
            for (size_t f = 0u; f < nfuns(); ++f)
            {
                if ((res[off->ord][f] > 1.00001) || (res[off->ord][f] < -.0000009))
                {
                    auto msg = "[geese] Out-of-range probability for node.id " +
                        std::to_string(off->id) + " for function " +
                        std::to_string(f) + ": " +
                        std::to_string(res[off->ord][f]);

                    throw std::logic_error(msg);

                }

                // Clamping tolerated floating-point drift into [0, 1].
                if (res[off->ord][f] > 1.0)
                    res[off->ord][f] = 1.0;
                else if (res[off->ord][f] < 0.0)
                    res[off->ord][f] = 0.0;

            }

        }

    } // end for over preorder

    return res;

}
278 
/**
 * @brief Public prediction entry point: per-node, per-function posterior
 * probabilities P(x_{node,f} = 1 | Data).
 *
 * Refreshes the model's powerset probabilities for `par`, then delegates to
 * predict_backend(). Optionally re-predicts each annotated leaf with its own
 * annotation blanked out (leave-one-out cross-validation).
 *
 * @param par Model parameters (term coefficients followed by nfuns() root
 *        logits); split here the same way predict_backend() splits them.
 * @param res_prob If non-null, resized to nnodes() and filled with each
 *        node's full posterior state distribution (indexed by node.ord).
 * @param leave_one_out If true, each annotated leaf's prediction is replaced
 *        by one computed with that leaf's annotations set to missing.
 * @param only_annotated If true, traverse the reduced (annotated-only)
 *        sequence instead of the full sequence.
 * @param use_reduced_sequence Forwarded to predict_backend()/likelihood().
 * @return nnodes() x nfuns() matrix of marginal probabilities, indexed by
 *         node.ord.
 */
inline std::vector< std::vector<double> > Geese::predict(
    const std::vector< double > & par,
    std::vector< std::vector< double > > * res_prob,
    bool leave_one_out,
    bool only_annotated,
    bool use_reduced_sequence
)
{

    INITIALIZED()

    // The stored sequences are postorder; reversing yields the root-first
    // (preorder) traversal the backend expects.
    std::vector< size_t > preorder;
    if (only_annotated)
        preorder = this->reduced_sequence;
    else
        preorder = this->sequence;

    std::reverse(preorder.begin(), preorder.end());

    // Refreshing the model's powerset probabilities with the term
    // coefficients (par minus the trailing root logits), single core.
    std::vector< double > par0(par.begin(), par.end() - nfuns());
    model->update_pset_probs(par0, 1u);

    // Full prediction (first run; right now this is done twice when
    // leave_one_out is set — needs fixing in the future).
    std::vector< std::vector<double> > res = predict_backend(
        par, use_reduced_sequence, preorder
    );

    // If the user requires the probability matrix per state.
    if (res_prob != nullptr)
    {

        res_prob->resize(nnodes());
        for (auto& i : sequence)
            res_prob->at(nodes[i].ord) = nodes[i].probability;

    }

    // Leave-one-out: re-predict each annotated leaf with its own annotation
    // removed, so the prediction is not informed by the leaf itself.
    if (leave_one_out)
    {

        // 9u encodes a missing annotation throughout.
        std::vector< size_t > default_empty(nfuns(), 9u);
        for (auto& n : nodes)
        {

            if (n.second.is_leaf())
            {

                Node & ntmp = n.second;

                // Only worth doing if the leaf has at least one observed
                // (non-missing) annotation.
                bool use_it = false;
                for (auto& n_state : ntmp.annotations)
                    if (n_state != 9u)
                    {

                        use_it = true;
                        break;

                    }


                if (!use_it)
                    continue;

                // Recording the original annotation.
                auto old_ann = ntmp.annotations;

                // Blanking the entire gene (all functions set to missing).
                // NOTE(review): if predict_backend() throws here, the node is
                // left blanked — the restore below never runs; confirm whether
                // callers rely on exception safety.
                update_annotations(ntmp.id, default_empty);

                // Making the prediction and keeping only this leaf's row.
                res[ntmp.ord] = (
                    predict_backend(par, use_reduced_sequence, preorder)
                )[ntmp.ord];

                // Restoring the original annotation.
                update_annotations(ntmp.id, old_ann);

                // Overwrite the state distribution with the leave-one-out one.
                if (res_prob != nullptr)
                    res_prob->operator[](ntmp.ord) = ntmp.probability;

            }

        }

    }

    return res;

}
373 
374 #endif
std::vector< size_t > reduced_sequence
std::vector< size_t > sequence
std::vector< std::vector< double > > predict(const std::vector< double > &par, std::vector< std::vector< double > > *res_prob=nullptr, bool leave_one_out=false, bool only_annotated=false, bool use_reduced_sequence=true)
size_t nnodes() const noexcept
Number of nodes (interior + leaf)
Definition: geese-meat.hpp:437
double likelihood(const std::vector< double > &par, bool as_log=false, bool use_reduced_sequence=true, size_t ncores=1u, bool no_update_pset_probs=false)
void update_annotations(size_t nodeid, std::vector< size_t > newann)
Definition: geese-meat.hpp:288
std::map< size_t, Node > nodes
std::vector< std::vector< double > > predict_backend(const std::vector< double > &par, bool use_reduced_sequence, const std::vector< size_t > &preorder)
barry::MapVec_type< size_t > map_to_state_id
size_t nfuns() const noexcept
Number of functions analyzed.
Definition: geese-meat.hpp:430
A single node for the model.
std::vector< size_t > annotations
Observed annotations (only defined for Geese)
size_t id
Id of the node (as specified in the input)
std::vector< Node * > offspring
Offspring nodes.
std::vector< size_t > narray
ID of the array in the model.
std::vector< double > probability
The probability of observing each state.
size_t ord
Order in which the node was created.
bool is_leaf() const noexcept
std::vector< double > subtree_prob
Induced subtree probabilities.
return res
size_t i
#define INITIALIZED()
Definition: geese-bones.hpp:22
barry::BArrayDense< size_t, NodeData > PhyloArray