barry: Your go-to motif accountant  0.0-1
Full enumeration of sample space and fast count of sufficient statistics for binary arrays
geese-meat.hpp
Go to the documentation of this file.
1 // #include "geese-bones.hpp"
2 
3 #ifndef GEESE_MEAT_HPP
4 #define GEESE_MEAT_HPP 1
5 
6 inline void Geese::init_node(Node & n)
7 {
8 
9  // Creating the phyloarray, nfunctions x noffspring
10  n.array = PhyloArray(nfunctions, n.offspring.size());
11 
12  std::vector< bool > tmp_state = vector_caster<bool,size_t>(n.annotations);
13 
14  std::vector< double > blen(n.offspring.size(), 1.0);
15 
16  n.array.set_data(
17  new NodeData(blen, tmp_state, n.duplication),
18  true
19  );
20 
21  // We initialize all with a zero since, if excluded from the pruning process,
22  // We need to set it to one (as the result of the full integration).
23  n.subtree_prob.resize(states.size(), 1.0);
24 
25  // Adding the data, first through functions
26  for (size_t k = 0u; k < nfunctions; ++k)
27  {
28 
29  // Then through the offspring
30  size_t j = 0;
31  for (auto& o : n.offspring)
32  {
33 
34  // If leaf, then it may have an annotation
35  if (o->is_leaf())
36  {
37 
38  if (o->annotations[k] != 0)
39  n.array.insert_cell(k, j, o->annotations[k], false, false);
40 
41  }
42  else
43  {
44  // [2022-02-11]: (IMPORTANT COMMENT!)
45  // Otherwise, we fill it with a 0 so the support works correctly.
46  // When adding an array from the interior, we don't need to deal
47  // with the actual value as it is the powerset that matters. Using
48  // nine instead will block the cell and stop the routine for computing
49  // the values correctly
50  n.array.insert_cell(k, j, 9u, false, false);
51 
52  }
53 
54  ++j;
55 
56  }
57 
58  }
59 
60  // We then need to set the powerset
61  if (n.arrays.size() != states.size())
62  {
63 
64  n.arrays.resize(states.size());
65  n.narray.resize(states.size());
66 
67  }
68 
69  // Here we have an issue: Some transitions may not be right
70  // under the dynamic rules. So not all states can be valid.
71  // The arrays and narrays need to be updated once the model
72  // is initialized.
73  //
74  // The later is especially true for leaf nodes, where the
75  // limitations are not known until the model is initialized.
76  // PhyloStatsCounter stats_counter;
77  // stats_counter.set_counters(model->get_counters());
78  for (size_t s = 0u; s < states.size(); ++s)
79  {
80 
81  n.arrays[s] = PhyloArray(n.array, false);
82 
83  n.arrays[s].set_data(
84  new NodeData(blen, states[s], n.duplication),
85  true
86  );
87 
88  // Use try catch to run the following lines of code
89  // only if the array is valid.
90  try
91  {
92  n.narray[s] = model->add_array(n.arrays[s]);
93  }
94  catch (const std::exception & e)
95  {
96  auto err = std::string(e.what());
97 
98  err = "Array " + std::to_string(n.id) +
99  " cannot be added to the model with error:\n" + err +
100  "\n. This is likely due to a dynamic rule. " +
101  "The array to be added was in the following state:";
102 
103  std::string state_str = "";
104  for (const auto & ss : states[s])
105  state_str += std::to_string(ss) + " ";
106 
107  err += state_str + "\n";
108 
109  throw std::runtime_error(err);
110 
111  }
112 
113  }
114 
115  return;
116 
117 }
118 
119 inline Geese::~Geese() {
120 
121  if (delete_support)
122  delete model;
123 
124  if (delete_rengine)
125  delete rengine;
126 
127  return;
128 
129 }
130 
131 inline void Geese::init(size_t bar_width) {
132 
133  // Initializing the model, if it is null
134  if (this->model == nullptr)
135  {
136 
137  this->model = new PhyloModel();
138 
139  this->delete_support = true;
140  this->model->add_hasher(keygen_full);
141 
142  this->model->store_psets();
143 
144  // // Adding static rule
145  // rule_leafs(model->get_support_fun());
146 
147  }
148 
149  // Checking rseed, this is relevant when dealing with a flock. In the case of
150  // flock, both model and rengine are shared.
151  if (this->model->get_rengine() == nullptr)
152  this->model->set_rengine(this->rengine, false);
153 
154  // All combinations of the function
155  PhyloPowerSet pset(nfunctions, 1u);
156 
157  pset.calc();
158 
159  states.reserve(pset.data.size());
160 
161  size_t i = 0u;
162 
163  for (auto& iter : pset.data)
164  {
165 
166  states.emplace_back(std::vector< bool >(nfunctions, false));
167 
168  for (auto j = 0u; j < nfunctions; ++j)
169  {
170 
171  if (!iter.is_empty(j, 0u, false))
172  states[i][j] = true;
173 
174  }
175 
176  // Adding to map so we can look at it later on
177  map_to_state_id.insert({iter.get_col_vec(0u, false), i});
178 
179  i++;
180 
181  }
182 
183  if (bar_width > 0u)
184  {
185  printf_barry("Initializing nodes in Geese (this could take a while)...\n");
186 
187  barry::Progress prog_bar(this->nnodes(), bar_width);
188 
189  // Iterating throught the nodes
190  for (auto& iter : nodes)
191  {
192 
193  // Only parents get a node
194  if (!iter.second.is_leaf())
195  this->init_node(iter.second);
196 
197  prog_bar.next();
198 
199  }
200 
201  prog_bar.end();
202 
203 
204  }
205  else
206  {
207 
208  // Iterating throught the nodes
209  for (auto& iter : nodes)
210  {
211 
212  // Only parents get a node
213  if (!iter.second.is_leaf())
214  this->init_node(iter.second);
215 
216  }
217 
218  }
219 
220  // Resetting the sequence
221  for (auto& n: this->nodes)
222  n.second.visited = false;
223 
224  // The first time it is called, it need to generate the corresponding
225  // hashes of the columns so it is fast to access then (saves time
226  // hashing and looking in the map.)
227  auto sup_arrays = model->get_pset_arrays();
228 
229  pset_loc.resize(sup_arrays->size());
230  std::vector< size_t > tmpstate(nfunctions);
231 
232  for (auto s = 0u; s < sup_arrays->size(); ++s)
233  {
234 
235  auto sup_array = sup_arrays->operator[](s);
236  pset_loc[s].resize(sup_array.size());
237 
238  for (auto a = 0u; a < sup_array.size(); ++a)
239  {
240 
241  for (auto o = 0u; o < sup_array[a].ncol(); ++o)
242  {
243 
244  sup_array[a].get_col_vec(&tmpstate, o, false);
245  pset_loc[s][a].push_back(map_to_state_id[tmpstate]);
246 
247  }
248 
249  }
250 
251  }
252 
253  // So that others now know it was initialized
254  initialized = true;
255 
256  return;
257 
258 }
259 
260 inline void Geese::inherit_support(const Geese & model_, bool delete_support_)
261 {
262 
263  if (this->model != nullptr)
264  throw std::logic_error(
265  "There is already a -model- in this Geese. Cannot set a -model- after one is present."
266  );
267 
268  this->model = model_.model;
269 
270  this->delete_support = delete_support_;
271 
272  // And random number generation
273  if (this->delete_rengine)
274  {
275 
276  delete this->rengine;
277 
278  this->delete_rengine = false;
279 
280  }
281 
282  this->rengine = model_.rengine;
283 
284  return;
285 
286 }
287 
289  size_t nodeid,
290  std::vector< size_t > newann
291 ) {
292 
293  // This can only be done if it has been initialized
294  INITIALIZED()
295 
296  // Is this node present?
297  if (nodes.find(nodeid) == nodes.end())
298  throw std::length_error("The requested node is not present.");
299 
300  if (nodes[nodeid].annotations.size() != newann.size())
301  throw std::length_error("Incorrect length of the new annotations.");
302 
303  // Resetting the annotations, and updating the stats from the
304  // parent node
305  nodes[nodeid].annotations = newann;
306 
307  // This only makes sense (for now) if it is a tip
308  if (!nodes[nodeid].is_leaf())
309  return;
310 
311  init_node(*nodes[nodeid].parent);
312 
313  return;
314 
315 }
316 
317 inline void Geese::calc_sequence(Node * n)
318 {
319 
320  if (sequence.size() == nodes.size())
321  return;
322 
323  // First iteration
324  if (n == nullptr)
325  n = &(nodes.begin()->second);
326 
327  // Here before?
328  if (n->visited)
329  return;
330 
331  n->visited = true;
332 
333  if (!n->is_leaf())
334  {
335 
336  // iterating over its offspring, only if not there before
337  for (auto& it : n->offspring)
338  {
339 
340  if (!it->visited)
341  calc_sequence(it);
342 
343  }
344 
345  }
346 
347  // Now, adding to the list and going to its parent
348  sequence.push_back(n->id);
349 
350  if (n->parent == nullptr)
351  return;
352 
353  // Go to the parent iff not visited
354  if (!n->parent->visited)
355  calc_sequence(n->parent);
356 
357  return;
358 
359 }
360 
362 {
363 
364  // The criteria, if none of its decendants is annotated, then we can remove
365  // the node from the model
366  std::vector< bool > includeit(nodes.size(), false);
367 
368  for (auto& i : sequence)
369  {
370 
371  Node & n = nodes[i];
372 
373  // We will count this at the end
374  if (n.is_leaf())
375  {
376 
377  for (size_t k = 0u; k < nfuns(); ++k)
378  if (n.annotations[k] != 9u)
379  {
380 
381  includeit[n.ord] = true;
382  reduced_sequence.push_back(i);
383  break;
384 
385  }
386 
387  }
388  else
389  {
390 
391  // Checking, am I including any of my offspring?
392  for (auto& o : n.offspring)
393 
394  if (includeit[o->ord])
395  {
396 
397  includeit[n.ord] = true;
398  reduced_sequence.push_back(i);
399  break;
400 
401  }
402 
403  }
404 
405  }
406 
407 }
408 
409 inline std::vector< double > Geese::get_probabilities() const
410 {
411 
412  std::vector< double > res;
413 
414  res.reserve(
415  this->states.size() * nodes.size()
416  );
417 
418  for (auto& i : sequence)
419  {
420 
421  for (auto& p : this->nodes.at(i).subtree_prob)
422  res.push_back(p);
423 
424  }
425 
426  return res;
427 
428 }
429 
430 inline size_t Geese::nfuns() const noexcept
431 {
432 
433  return this->nfunctions;
434 
435 }
436 
437 inline size_t Geese::nnodes() const noexcept
438 {
439 
440  return this->nodes.size();
441 
442 }
443 
444 inline size_t Geese::nleafs() const noexcept
445 {
446 
447  size_t n = 0u;
448 
449  for (auto& iter : this->nodes)
450  if (iter.second.is_leaf())
451  n++;
452 
453  return n;
454 }
455 
456 inline size_t Geese::nterms() const
457 {
458 
459  return model->nterms() + this->nfuns();
460 
461 }
462 
463 inline size_t Geese::support_size() const noexcept
464 {
465 
466  if (model == nullptr)
467  return 0u;
468 
469  return model->support_size();
470 
471 }
472 
473 inline std::vector< size_t > Geese::nannotations() const noexcept
474 {
475 
476  std::vector< size_t > ans = {this->n_zeros, this->n_ones};
477 
478  return ans;
479 
480 }
481 
482 inline std::vector< std::string > Geese::colnames() const
483 {
484 
485  return this->model->colnames();
486 
487 }
488 
490  bool verb,
491  std::vector< size_t > * dist
492 ) const noexcept
493 {
494 
495  size_t largest = 0u;
496  for (const auto& n : this->nodes)
497  {
498 
499  if (n.second.is_leaf())
500  continue;
501 
502  size_t noff = n.second.noffspring();
503 
504  if (dist)
505  dist->push_back(noff);
506 
507  if (noff > 2u)
508  {
509 
510  if (verb)
511  printf_barry("Node id: %li has polytomy size %li\n", n.second.id, noff);
512 
513  }
514 
515  if (noff > largest)
516  largest = noff;
517 
518  }
519 
520  return largest;
521 
522 }
523 
524 inline std::vector< std::vector<double> > Geese::observed_counts()
525 {
526 
527  // Making room for the output
528  std::vector<std::vector<double>> ans;
529 
530  ans.reserve(nnodes());
531 
532  // Creating counter
533  PhyloStatsCounter tmpcount;
534 
535  tmpcount.set_counters(this->model->get_counters());
536 
537  // Iterating through the nodes
538  for (auto& n : nodes)
539  {
540 
541  if (n.second.is_leaf())
542  {
543 
544  ans.emplace_back(std::vector<double>({}));
545  continue;
546 
547  }
548 
549  PhyloArray tmparray(nfuns(), n.second.offspring.size());
550 
551  size_t j = 0u;
552 
553  for (auto& o : n.second.offspring)
554  {
555 
556  for (size_t k = 0u; k < nfuns(); ++k)
557  {
558 
559  if (o->annotations.at(k) != 0)
560  {
561 
562  tmparray.insert_cell(
563  k, j, o->annotations.at(k), false, false
564  );
565 
566  }
567 
568  }
569 
570  ++j;
571 
572  }
573 
574  std::vector< bool > tmp_state = vector_caster<bool,size_t>(
575  n.second.annotations
576  );
577 
578  std::vector< double > blen(n.second.offspring.size(), 1.0);
579 
580  tmparray.set_data(
581  new NodeData(blen, tmp_state, n.second.duplication),
582  true
583  );
584 
585  tmpcount.reset_array(&tmparray);
586 
587  ans.push_back(tmpcount.count_all());
588 
589  }
590 
591  return ans;
592 
593 }
594 
596 {
597 
598  // Making room for the output
599  std::vector<std::vector<double>> ans;
600  ans.reserve(nnodes());
601 
602  // Creating counter
603  PhyloStatsCounter tmpcount;
604  tmpcount.set_counters(this->model->get_counters());
605 
606  // Iterating through the nodes
607  for (auto& n : nodes) {
608 
609  if (n.second.is_leaf()) {
610  ans.emplace_back(std::vector< double >({}));
611  continue;
612  }
613 
614  PhyloArray tmparray(nfuns(), n.second.offspring.size());
615 
616  size_t j = 0u;
617  for (auto& o : n.second.offspring) {
618  for (size_t k = 0u; k < nfuns(); ++k) {
619  if (o->annotations.at(k) != 0) {
620  tmparray.insert_cell(
621  k, j, o->annotations.at(k), false, false
622  );
623  }
624  }
625  ++j;
626  }
627 
628  std::vector< bool > tmp_state =vector_caster<bool,size_t>(n.second.annotations);
629  std::vector< double > blen(n.second.offspring.size(), 1.0);
630  tmparray.set_data(
631  new NodeData(blen, tmp_state, n.second.duplication),
632  true
633  );
634 
635  tmpcount.reset_array(&tmparray);
636  std::vector< double > counts = tmpcount.count_all();
637 
638  // Printing
639  auto dpl = n.second.duplication ? "duplication" : "speciation";
640  printf_barry("----------\n");
641  printf_barry("nodeid: % 3li (%s)\nstate: [", n.second.id, dpl);
642  for (size_t f = 0u; f < nfuns(); ++f)
643  printf_barry("%i, ", (tmparray.D_ptr()->states[f] ? 1 : 0));
644 
645  printf_barry("]; Array:\n");
646  tmparray.print();
647  printf_barry("Counts: ");
648  for (auto& c : counts)
649  printf_barry("%.2f, ", c);
650  printf_barry("\n");
651 
652  }
653 
654  return;
655 
656 }
657 
658 inline void Geese::print() const
659 {
660 
661  // Information about the tree:
662  // - Number of functions
663  // - Number of nodes and leafs
664  // - Number of annotated leafs (0/1)
665  printf_barry("GEESE\nINFO ABOUT PHYLOGENY\n");
666  printf_barry("# of functions : %li\n", this->nfuns());
667  printf_barry("# of nodes [int; leaf] : [%li; %li]\n", this->nnodes() - this->nleafs(), this->nleafs());
668  printf_barry("# of ann. [zeros; ones] : [%li; %li]\n", this->n_zeros, this->n_ones);
669  printf_barry("# of events [dupl; spec] : [%li; %li]\n", this->n_dupl_events, this->n_spec_events);
670  printf_barry("Largest polytomy : %li\n", parse_polytomies(false));
671  printf_barry("\nINFO ABOUT THE SUPPORT\n");
672  this->model->print();
673 
674 }
675 
676 inline void Geese::print_nodes() const
677 {
678 
679  printf_barry("GEESE\nINFO ABOUT NODES\n");
680 
681  for (const auto & n: nodes)
682  {
683  printf_barry("% 4li - Id: %li -- ", n.second.ord, n.second.id);
684 
685  // Node type
686  printf_barry(
687  "node type: %s -- ",
688  n.second.is_leaf() ?
689  std::string("leaf").c_str() :
690  std::string("internal").c_str()
691  );
692 
693  // Event type
694  printf_barry(
695  "event type: %s -- ",
696  n.second.duplication ?
697  std::string("duplication").c_str() :
698  std::string("speciation").c_str()
699  );
700 
701  // Annotations
702  printf_barry("ann: [");
703  for (const auto & a: n.second.annotations)
704  {
705  // Print with ']' if last element
706  if (&a == &n.second.annotations.back())
707  {
708  printf_barry("%li] -- ", a);
709  }
710  else
711  {
712  printf_barry("%li, ", a);
713  }
714  }
715 
716  // Parent information
717  if (n.second.parent == nullptr)
718  {
719  printf_barry("parent id: (none) -- ");
720  } else {
721  printf_barry("parent id: %li -- ", n.second.parent->id);
722  }
723 
724  // Offspring information
725  if (n.second.offspring.size() > 0u)
726  {
727  printf_barry("off ids: [");
728  for (const auto & o: n.second.offspring)
729  {
730  // Same as in previous loop
731  if (&o == &n.second.offspring.back())
732  {
733  printf_barry("%li].", o->id);
734  }
735  else
736  {
737  printf_barry("%li, ", o->id);
738  }
739  }
740  }
741 
742  printf_barry("\n");
743 
744  }
745 
746 
747 }
748 
749 inline std::mt19937 * Geese::get_rengine()
750 {
751  return this->rengine;
752 }
753 
755 {
756  return this->model->get_counters();
757 }
758 
760  return this->model;
761 }
762 
764  return this->model->get_support_fun();
765 }
766 
767 inline std::vector< std::vector< bool > > Geese::get_states() const {
768  return this->states;
769 }
770 
771 inline std::vector< size_t > Geese::get_annotated_nodes() const {
772 
773  std::vector< size_t > ids(0u);
774  for (auto & n : nodes)
775  {
776 
777  // Counting non-9 annotations
778  for (size_t f = 0u; f < nfuns(); ++f)
779  {
780  // If it has one non-9, then add it to the list
781  // and continue to the next node.
782  if (n.second.annotations[f] != 9u) {
783  ids.push_back(n.second.id);
784  break;
785  }
786  }
787 
788  }
789 
790  return ids;
791 
792 }
793 
794 inline std::vector< size_t > Geese::get_annotations() const {
795 
796  // Makeing space for the annotations
797  std::vector< size_t > ann(this->nfuns() * this->nnodes(), 9u);
798  size_t nrows = this->nnodes();
799  for (auto & n : nodes)
800  {
801 
802  // Getting the location
803  size_t row = n.second.ord;
804 
805  // Counting non-9 annotations
806  for (size_t f = 0u; f < nfuns(); ++f)
807  {
808  // If it has one non-9, then add it to the list
809  // and continue to the next node.
810  if (n.second.annotations[f] != 9u) {
811  ann[f * nrows + row] = n.second.annotations[f];
812  }
813  }
814 
815 
816  }
817 
818  return ann;
819 
820 }
821 
822 
823 #endif
#define printf_barry
Annotated Phylo Model.
std::vector< size_t > reduced_sequence
std::vector< std::vector< bool > > get_states() const
Powerset of a gene's possible states.
Definition: geese-meat.hpp:767
std::vector< size_t > sequence
void calc_reduced_sequence()
Definition: geese-meat.hpp:361
std::vector< std::vector< std::vector< size_t > > > pset_loc
Locations of columns.
std::vector< size_t > get_annotations() const
Returns the annotations of the nodes with at least one annotation.
Definition: geese-meat.hpp:794
void print() const
Prints information about the GEESE.
Definition: geese-meat.hpp:658
bool delete_rengine
PhyloSupport * get_support_fun()
Definition: geese-meat.hpp:763
PhyloCounters * get_counters()
Definition: geese-meat.hpp:754
size_t support_size() const noexcept
Number of unique sets of sufficient stats.
Definition: geese-meat.hpp:463
void init_node(Node &n)
Definition: geese-meat.hpp:6
void print_observed_counts()
Definition: geese-meat.hpp:595
size_t nfunctions
size_t parse_polytomies(bool verb=true, std::vector< size_t > *dist=nullptr) const noexcept
Check polytomies and return the largest.
Definition: geese-meat.hpp:489
std::vector< std::string > colnames() const
Names of the terms in the model.
Definition: geese-meat.hpp:482
void init(size_t bar_width=BARRY_PROGRESS_BAR_WIDTH)
Definition: geese-meat.hpp:131
size_t nnodes() const noexcept
Number of nodes (interior + leaf)
Definition: geese-meat.hpp:437
std::vector< double > get_probabilities() const
Definition: geese-meat.hpp:409
PhyloModel * get_model()
Definition: geese-meat.hpp:759
std::vector< std::vector< double > > observed_counts()
Definition: geese-meat.hpp:524
size_t nleafs() const noexcept
Number of leaf.
Definition: geese-meat.hpp:444
bool delete_support
bool initialized
void calc_sequence(Node *n=nullptr)
Definition: geese-meat.hpp:317
void inherit_support(const Geese &model_, bool delete_support_=false)
Definition: geese-meat.hpp:260
std::vector< size_t > get_annotated_nodes() const
Returns the ids of the nodes with at least one annotation.
Definition: geese-meat.hpp:771
std::vector< size_t > nannotations() const noexcept
Number of annotations.
Definition: geese-meat.hpp:473
void print_nodes() const
Definition: geese-meat.hpp:676
void update_annotations(size_t nodeid, std::vector< size_t > newann)
Definition: geese-meat.hpp:288
std::map< size_t, Node > nodes
std::mt19937 * get_rengine()
Definition: geese-meat.hpp:749
barry::MapVec_type< size_t > map_to_state_id
size_t nterms() const
Number of terms included.
Definition: geese-meat.hpp:456
size_t nfuns() const noexcept
Number of functions analyzed.
Definition: geese-meat.hpp:430
Data definition for the PhyloArray class.
Definition: geese-types.hpp:15
A single node for the model.
std::vector< size_t > annotations
Observed annotations (only defined for Geese)
PhyloArray array
Array of the node.
bool duplication
std::vector< PhyloArray > arrays
Arrays given all possible states.
size_t id
Id of the node (as specified in the input)
std::vector< Node * > offspring
Offspring nodes.
std::vector< size_t > narray
ID of the array in the model.
bool visited
Node * parent
Parent node.
size_t ord
Order in which the node was created.
bool is_leaf() const noexcept
std::vector< double > subtree_prob
Induced subtree probabilities.
return res
size_t size_t j
size_t i
Data_Type &&counter_ noexcept
#define INITIALIZED()
Definition: geese-bones.hpp:22
std::vector< double > keygen_full(const PhyloArray &array, const PhyloCounterData *d)
Definition: geese-bones.hpp:36
barry::Counters< PhyloArray, PhyloCounterData > PhyloCounters
barry::Model< PhyloArray, PhyloCounterData, PhyloRuleData, PhyloRuleDynData > PhyloModel
barry::BArrayDense< size_t, NodeData > PhyloArray
barry::StatsCounter< PhyloArray, PhyloCounterData > PhyloStatsCounter
barry::PowerSet< PhyloArray, PhyloRuleData > PhyloPowerSet
barry::Support< PhyloArray, PhyloCounterData, PhyloRuleData, PhyloRuleDynData > PhyloSupport