barry: Your go-to motif accountant  0.0-1
Full enumeration of sample space and fast count of sufficient statistics for binary arrays
defm-meat.hpp
Go to the documentation of this file.
1 #ifndef DEFM_MEAT_HPP
2 #define DEFM_MEAT_HPP 1
3 
4 inline std::vector< double > keygen_defm(
5  const DEFMArray & Array_,
7  ) {
8 
9  size_t nrow = Array_.nrow();
10  size_t ncol = Array_.ncol();
11 
12  std::vector< double > res;
13  res.reserve(
14  2u + // Rows + cols
15  ncol * (nrow - 1u) // Markov cells
16  );
17 
18  res.push_back(static_cast<double>(nrow));
19  res.push_back(static_cast<double>(ncol));
20 
21  // size_t iter = 2u;
22  // Adding the cells
23  for (size_t i = 0u; i < (nrow - 1); ++i)
24  for (size_t j = 0u; j < ncol; ++j)
25  res.push_back(Array_(i, j));
26 
27  return res;
28 
29 }
30 
31 #define DEFM_RANGES(a) \
32  size_t start_i = start_end[a * 2u];\
33  size_t end_i = start_end[a * 2u + 1u];\
34  size_t nobs_i = end_i - start_i + 1u;
35 
36 #define DEFM_LOOP_ARRAYS(a) \
37  for (size_t a = 0u; a < (nobs_i - M_order); ++a)
38 
39 inline void DEFM::simulate(
40  std::vector< double > par,
41  int * y_out
42 ) {
43 
44  size_t model_num = 0u;
45  size_t n_entry = M_order * Y_ncol;
46  auto idx = this->get_arrays2support();
47  DEFMArray last_array;
48  for (size_t i = 0u; i < N; ++i)
49  {
50 
51  // Figuring out how many processes can we observe
52  DEFM_RANGES(i)
53 
54  DEFM_LOOP_ARRAYS(proc_n)
55  {
56 
57  // In the first process, we take the data as is
58  if (proc_n == 0u)
59  {
60  last_array = this->sample(idx->at(model_num++), par);
61  for (size_t y = 0u; y < Y_ncol; ++y)
62  *(y_out + n_entry++) = last_array(M_order, y, false);
63 
64  // last_array.print("i: %li, proc_n: %li\n", i, proc_n);
65 
66  }
67  else
68  // Otherwise, we need to continue using the previous data!
69  {
70  // Removing the previous row
71  DEFMArray tmp_array(M_order + 1u, Y_ncol);
72  for (size_t t_i = 1u; t_i < (M_order + 1u); ++t_i)
73  for (size_t t_j = 0u; t_j < Y_ncol; ++t_j)
74  tmp_array(t_i - 1u, t_j) = last_array(t_i, t_j);
75 
76  // Setting the data
77  tmp_array.set_data(
78  new DEFMData(
79  &tmp_array, X, (start_i + proc_n), X_ncol, ID_length,
80  this->column_major
81  ),
82  true // Delete the data
83  );
84 
85  // Baseline
86  // tmp_array.print("baseline i: %li, proc_n: %li\n", i, proc_n);
87  // tmp_array.D().print();
88 
89  model_num++;
90  last_array = this->sample(tmp_array, par);
91  for (size_t y = 0u; y < Y_ncol; ++y)
92  *(y_out + n_entry++) = last_array(M_order, y, false);
93 
94  // last_array.print("generated i: %li, proc_n: %li\n", i, proc_n);
95 
96  }
97 
98 
99 
100  }
101 
102  n_entry += M_order * Y_ncol;
103 
104  }
105 
106 }
107 
108 inline DEFM::DEFM(
109  int * id,
110  int * y,
111  double * x,
112  size_t id_length,
113  size_t y_ncol,
114  size_t x_ncol,
115  size_t m_order,
116  bool copy_data,
117  bool column_major
118 ) : column_major(column_major) {
119 
120  // Pointers
121  if (copy_data)
122  {
123 
124  ID_shared = std::make_shared< std::vector<int> >(id_length);
125  Y_shared = std::make_shared< std::vector<int> >(id_length * y_ncol);
126  X_shared = std::make_shared< std::vector<double> >(id_length * x_ncol);
127 
128  for (size_t i = 0u; i < id_length; ++i)
129  ID_shared->at(i) = *(id + i);
130 
131  for (size_t i = 0u; i < (id_length * y_ncol); ++i)
132  Y_shared->at(i) = *(y + i);
133 
134  for (size_t i = 0u; i < (id_length * x_ncol); ++i)
135  X_shared->at(i) = *(x + i);
136 
137  ID = &ID_shared->at(0u);
138  Y = &Y_shared->at(0u);
139  X = &X_shared->at(0u);
140 
141  } else {
142 
143  ID = id;
144  Y = y;
145  X = x;
146 
147  }
148 
149  // Overall dimmensions
150  ID_length = id_length;
151 
152  Y_ncol = y_ncol;
153  Y_length = y_ncol * id_length;
154 
155  X_ncol = x_ncol;
156  X_length = x_ncol * id_length;
157 
158  M_order = m_order;
159 
160  // Creating the model and engine
161  this->rengine = new std::mt19937();
162  this->delete_rengine = true;
163 
164  this->store_psets();
165  auto kgen = keygen_defm;
166  this->add_hasher(kgen);
167 
168  // Iterating for adding observations
169  start_end.reserve(id_length);
170  start_end.push_back(0);
171 
172  // Identifying the start and end of each observation
173  N = 0u;
174  for (size_t row = 1u; row < id_length; ++row)
175  {
176 
177  // Still in the individual
178  if (*(id + row) != *(id + row - 1u))
179  {
180 
181  // End of the previous observation
182  start_end.push_back(row - 1u);
183 
184  // In the case that the start and end do not fit
185  // within the markov process order, then it should fail
186  size_t n_rows_i = (row - 1u) - start_end[N++ * 2u] + 1;
187  if (n_rows_i < (M_order + 1u))
188  throw std::length_error(
189  "Obs. id: " + std::to_string(*(id + row - 1u)) + " (row " +
190  std::to_string(row) + ") has fewer rows (" +
191  std::to_string(n_rows_i) + ") than those needed (" +
192  std::to_string(M_order + 1) + ") for the Markov Model."
193  );
194 
195  // Beginning of the current
196  start_end.push_back(row);
197 
198  }
199 
200  }
201 
202  start_end.push_back(id_length - 1u);
203 
204  N++;
205 
206  // Creating the names
207  for (auto i = 0u; i < Y_ncol; ++i)
208  Y_names.emplace_back(std::string("y") + std::to_string(i));
209 
210  for (auto i = 0u; i < X_ncol; ++i)
211  X_names.emplace_back(std::string("X") + std::to_string(i));
212 
213  return;
214 
215 }
216 
217 
218 inline void DEFM::init()
219 {
220 
221  // Adding the rule
222  rules_markov_fixed(this->get_rules(), M_order);
223 
224  // Element access will be contingent on the column major
225  std::function<size_t(size_t,size_t,size_t,size_t)> element_access;
226 
227  if (this->column_major)
228  {
229 
230  element_access = [](size_t i, size_t j, size_t nrow, size_t) -> size_t {
231  return i + j * nrow;
232  };
233 
234  } else {
235 
236  element_access = [](size_t i, size_t j, size_t, size_t ncol) -> size_t {
237  return j + i * ncol;
238  };
239 
240  }
241 
242  // Creating the arrays
243  for (size_t i = 0u; i < N; ++i)
244  {
245 
246  // Figuring out how many processes can we observe
247  size_t start_i = start_end[i * 2u];
248  size_t end_i = start_end[i * 2u + 1u];
249  size_t nobs_i = end_i - start_i + 1u;
250 
251  // Creating the observations.
252  // Number of processes : (N rows) - (Process size)
253  for (size_t n_proc = 0u; n_proc < (nobs_i - M_order); ++n_proc)
254  {
255 
256  // Creating the array for process n_proc and setting the data
257  DEFMArray array(M_order + 1u, Y_ncol);
258  array.set_data(
259  new DEFMData(
260  &array, X, (start_i + n_proc), X_ncol, ID_length,
261  this->column_major
262  ),
263  true // Delete the data
264  );
265 
266  // Filling-out the array
267  for (size_t k = 0u; k < Y_ncol; ++k)
268  for (size_t o = 0u; o < (M_order + 1u); ++o)
269  // array(o, k) = *(Y + k * ID_length + start_i + n_proc + o);
270  array(o, k) = *(Y + element_access(
271  start_i + n_proc + o, // Row
272  k, // Column
273  ID_length, // N_row
274  Y_ncol // N_col
275  ));
276 
277  // Adding to the model
278  model_ord.push_back( this->add_array(array, true) );
279 
280  }
281 
282  }
283 
284 }
285 
286 inline size_t DEFM::get_n_y() const
287 {
288  return Y_ncol;
289 }
290 
291 inline size_t DEFM::get_n_obs() const
292 {
293  return N;
294 }
295 
296 inline size_t DEFM::get_n_covars() const
297 {
298  return X_ncol;
299 }
300 
301 inline size_t DEFM::get_m_order() const
302 {
303  return M_order;
304 }
305 
306 inline size_t DEFM::get_n_rows() const
307 {
308  return ID_length;
309 }
310 
311 inline const int * DEFM::get_Y() const
312 {
313  return Y;
314 }
315 
316 inline const int * DEFM::get_ID() const
317 {
318  return ID;
319 }
320 
321 inline const double * DEFM::get_X() const
322 {
323  return X;
324 }
325 
326 
327 inline barry::FreqTable<int> DEFM::motif_census(
328  std::vector< size_t > idx
329 ) {
330 
331  // Checking all sizes
332  for (const auto & i : idx)
333  if (i >= Y_ncol)
334  throw std::range_error("The -idx- for motif accounting is out of range.");
335 
336  barry::FreqTable<int> ans;
337  std::vector<int> array(idx.size() * (M_order + 1));
338 
339  for (size_t i = 0u; i < N; ++i)
340  {
341 
342  // Figuring out how many processes can we observe
343  DEFM_RANGES(i)
344 
345  DEFM_LOOP_ARRAYS(proc_n)
346  {
347 
348  // Generating an integer array between the parts
349  size_t nele = 0u;
350 
351  for (size_t o = 0u; o < (M_order + 1u); ++o)
352  for (auto & k : idx)
353  array[nele++] = *(Y + k * ID_length + start_i + proc_n + o);
354 
355  ans.add(array, nullptr);
356 
357  }
358 
359  }
360 
361  return ans;
362 
363 }
364 
365 inline std::vector< double > DEFM::logodds(
366  const std::vector< double > & par,
367  size_t i_,
368  size_t j_
369 ) {
370 
371 
372  std::vector< double > res(ID_length, std::nan(""));
373 
374  for (size_t i = 0u; i < N; ++i)
375  {
376 
377  // Figuring out how many processes can we observe
378  DEFM_RANGES(i)
379 
380  DEFM_LOOP_ARRAYS(n_proc)
381  {
382 
383  // Creating the array for process n_proc and setting the data
384  DEFMArray array(M_order + 1u, Y_ncol);
385  array.set_data(
386  new DEFMData(
387  &array, X, (start_i + n_proc), X_ncol, ID_length,
388  this->column_major
389  ),
390  true // Delete the data
391  );
392 
393  // Filling-out the array
394  for (size_t k = 0u; k < Y_ncol; ++k)
395  for (size_t o = 0u; o < (M_order + 1u); ++o)
396  array(o, k) = *(Y + k * ID_length + start_i + n_proc + o);
397 
398  double p_1 = this->conditional_prob(array, par, i_, j_);
399  res[M_order + start_i + n_proc] = std::log(p_1/(1.0 - p_1));
400 
401  }
402 
403  }
404 
405  return res;
406 
407 
408 }
409 
410 inline void DEFM::set_names(
411  std::vector< std::string > Y_names_,
412  std::vector< std::string > X_names_
413 ) {
414 
415  // Checking the length
416  if (Y_names_.size() != Y_ncol)
417  throw std::length_error("The length of Y_names_ doesn't match the number of dependent variables.");
418 
419  if (X_names_.size() != X_ncol)
420  throw std::length_error("The length of X_names_ doesn't match the number of dependent variables.");
421 
422  Y_names = Y_names_;
423  X_names = X_names_;
424 
425 }
426 
427 inline const std::vector<std::string > & DEFM::get_Y_names() const {
428  return Y_names;
429 }
430 
431 inline const std::vector<std::string > & DEFM::get_X_names() const {
432  return X_names;
433 }
434 
435 inline void DEFM::print() const
436 {
437  DEFMModel::print();
438  printf_barry("Model Y variables (%i):\n", static_cast<int>(get_n_y()));
439  int ny = 0;
440  for (const auto & y : get_Y_names())
441  {
442 
443  printf_barry(" % 2i) %s\n", ny++, y.c_str());
444 
445  }
446 }
447 
448 inline std::vector< bool > DEFM::is_motif()
449 {
450  std::vector< bool > res(0u);
451  auto * counterss = DEFMModel::get_counters();
452  for (size_t i = 0u; i < counters->size(); ++i)
453  res.push_back(counterss->operator[](i).data.is_motif);
454 
455  return res;
456 }
457 
458 inline bool DEFM::get_column_major() const noexcept
459 {
460  return column_major;
461 }
462 
463 #undef DEFM_RANGES
464 #undef DEFM_LOOP_ARRAYS
465 
466 #endif
#define printf_barry
Data class used to store arbitrary size_t or double vectors.
Definition: defm-types.hpp:66
Data class for DEFM arrays.
Definition: defm-types.hpp:16
std::vector< bool > is_motif()
Definition: defm-meat.hpp:448
size_t get_n_covars() const
Definition: defm-meat.hpp:296
DEFM(int *id, int *y, double *x, size_t id_length, size_t y_ncol, size_t x_ncol, size_t m_order, bool copy_data=true, bool column_major=true)
Definition: defm-meat.hpp:108
const int * get_Y() const
Definition: defm-meat.hpp:311
void init()
Definition: defm-meat.hpp:218
size_t get_m_order() const
Definition: defm-meat.hpp:301
void set_names(std::vector< std::string > Y_names_, std::vector< std::string > X_names_)
Definition: defm-meat.hpp:410
size_t get_n_rows() const
Definition: defm-meat.hpp:306
const int * get_ID() const
Definition: defm-meat.hpp:316
const double * get_X() const
Definition: defm-meat.hpp:321
bool get_column_major() const noexcept
Definition: defm-meat.hpp:458
std::vector< double > logodds(const std::vector< double > &par, size_t i, size_t j)
Definition: defm-meat.hpp:365
const std::vector< std::string > & get_Y_names() const
Definition: defm-meat.hpp:427
size_t get_n_obs() const
Definition: defm-meat.hpp:291
void print() const
Definition: defm-meat.hpp:435
const std::vector< std::string > & get_X_names() const
Definition: defm-meat.hpp:431
size_t get_n_y() const
Definition: defm-meat.hpp:286
void simulate(std::vector< double > par, int *y_out)
Definition: defm-meat.hpp:39
barry::FreqTable< int > motif_census(std::vector< size_t > idx)
Definition: defm-meat.hpp:327
return res
Data_Type &&counter_ data(std::move(counter_.data))
size_t size_t j
size_t i
Data_Type &&counter_ noexcept
std::vector< double > keygen_defm(const DEFMArray &Array_, DEFMCounterData *data)
Definition: defm-meat.hpp:4
#define DEFM_LOOP_ARRAYS(a)
Definition: defm-meat.hpp:36
#define DEFM_RANGES(a)
Definition: defm-meat.hpp:31
barry::BArrayDense< int, DEFMData > DEFMArray
Definition: defm-types.hpp:3
void rules_markov_fixed(DEFMRules *rules, size_t markov_order)
Number of edges.
Definition: counters.hpp:736