aGrUM 2.3.2
a C++ library for (probabilistic) graphical models
databaseTable.cpp
Go to the documentation of this file.
1/****************************************************************************
2 * This file is part of the aGrUM/pyAgrum library. *
3 * *
4 * Copyright (c) 2005-2025 by *
5 * - Pierre-Henri WUILLEMIN(_at_LIP6) *
6 * - Christophe GONZALES(_at_AMU) *
7 * *
8 * The aGrUM/pyAgrum library is free software; you can redistribute it *
9 * and/or modify it under the terms of either : *
10 * *
11 * - the GNU Lesser General Public License as published by *
12 * the Free Software Foundation, either version 3 of the License, *
13 * or (at your option) any later version, *
14 * - the MIT license (MIT), *
15 * - or both in dual license, as here. *
16 * *
17 * (see https://agrum.gitlab.io/articles/dual-licenses-lgplv3mit.html) *
18 * *
19 * This aGrUM/pyAgrum library is distributed in the hope that it will be *
20 * useful, but WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, *
21 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES MERCHANTABILITY or FITNESS *
22 * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE *
23 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER *
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, *
25 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR *
26 * OTHER DEALINGS IN THE SOFTWARE. *
27 * *
28 * See LICENCES for more details. *
29 * *
30 * SPDX-FileCopyrightText: Copyright 2005-2025 *
31 * - Pierre-Henri WUILLEMIN(_at_LIP6) *
32 * - Christophe GONZALES(_at_AMU) *
33 * SPDX-License-Identifier: LGPL-3.0-or-later OR MIT *
34 * *
35 * Contact : info_at_agrum_dot_org *
36 * homepage : http://agrum.gitlab.io *
37 * gitlab : https://gitlab.com/agrumery/agrum *
38 * *
39 ****************************************************************************/
40
41
47
49
50#ifndef DOXYGEN_SHOULD_SKIP_THIS
51
53# ifdef GUM_NO_INLINE
55# endif /* GUM_NO_INLINE */
56
57namespace gum::learning {
58
59 // default constructor
// The base IDatabaseTable is built from missing_symbols and an empty vector
// of variable names; _translators_ is initialized from translators.
60 DatabaseTable::DatabaseTable(const typename DatabaseTable::MissingValType& missing_symbols,
61 const DBTranslatorSet& translators) :
62 IDatabaseTable< DBTranslatedValue >(missing_symbols, std::vector< std::string >()),
63 _translators_(translators) {
64 if (translators.size()) {
65 // set the variables names according to those of the translators
// (the names are read from the member _translators_, one per translator)
66 std::vector< std::string > var_names(translators.size());
67 for (std::size_t i = std::size_t(0), size = translators.size(); i < size; ++i) {
68 var_names[i] = _translators_.translator(i).variable()->name();
69 }
// from_external_object = false: var_names holds exactly one name per translator
70 setVariableNames(var_names, false);
71 }
72
73 GUM_CONSTRUCTOR(DatabaseTable);
74 }
75
76 // default constructor
// Same as the constructor taking missing symbols, except that the base
// IDatabaseTable is built from two empty vectors of strings.
77 DatabaseTable::DatabaseTable(const DBTranslatorSet& translators) :
78 IDatabaseTable< DBTranslatedValue >(std::vector< std::string >(),
79 std::vector< std::string >()),
80 _translators_(translators) {
81 if (translators.size()) {
82 // set the variables names according to those of the translators
// (the names are read from the member _translators_, one per translator)
83 std::vector< std::string > var_names(translators.size());
84 for (std::size_t i = std::size_t(0), size = translators.size(); i < size; ++i) {
85 var_names[i] = _translators_.translator(i).variable()->name();
86 }
// from_external_object = false: var_names holds exactly one name per translator
87 setVariableNames(var_names, false);
88 }
89
90 GUM_CONSTRUCTOR(DatabaseTable);
91 }
92
93 // virtual copy constructor: returns a pointer to a new DatabaseTable
// allocated with new and copy-constructed from *this
// (presumably the caller is responsible for deleting it -- confirm)
94 DatabaseTable* DatabaseTable::clone() const { return new DatabaseTable(*this); }
95
96 // copy operator: copies the base IDatabaseTable part, the translator set
// and the set of ignored columns of from; self-assignment is a no-op
97 DatabaseTable& DatabaseTable::operator=(const DatabaseTable& from) {
98 if (this != &from) {
99 IDatabaseTable< DBTranslatedValue >::operator=(from);
100 _translators_ = from._translators_;
101 _ignored_cols_ = from._ignored_cols_;
102 }
103
104 return *this;
105 }
106
107 // move operator: moves the base IDatabaseTable part, the translator set
// and the set of ignored columns out of from; self-assignment is a no-op
108 DatabaseTable& DatabaseTable::operator=(DatabaseTable&& from) noexcept {
109 if (this != &from) {
110 IDatabaseTable< DBTranslatedValue >::operator=(std::move(from));
111 _translators_ = std::move(from._translators_);
112 _ignored_cols_ = std::move(from._ignored_cols_);
113 }
114
115 return *this;
116 }
117
// Adds a new translator, and therefore a new column, to the database table.
//
// translator    : translator handed to the translator set; the name of its
//                 variable becomes the name of the new column
// input_column  : column of the input dataset read by the translator; an
//                 error is reported if this column is marked as ignored
// unique_column : forwarded as is to the translator set
// returns the position returned by _translators_.insertTranslator
//
// If the table already contains rows, each of them receives the translator's
// missing value in the new column and is flagged as having a missing value.
119 std::size_t DatabaseTable::insertTranslator(const DBTranslator& translator,
120 const std::size_t input_column,
121 const bool unique_column) {
122 // check that there is no ignored_column corresponding to column
123 if (_ignored_cols_.exists(input_column))
125 "Column " << input_column << " is marked as being ignored. "
126 << "So it is forbidden to create a translator for that column.")
127
128 // reserve some place for the new column in the records of the database
129 const std::size_t new_size = this->nbVariables() + 1;
130
131 // create the lambda for reserving some memory for the new column
132 // and the one that undoes what it performed if some thread executing
133 // it raised an exception
134 auto reserve_lambda = [this, new_size](std::size_t begin, std::size_t end, std::size_t index) {
135 for (std::size_t i = begin; i < end; ++i)
136 this->rows_[i].row().reserve(new_size);
137 };
138
// reserving memory does not change the content of the rows: nothing to undo
139 auto undo_reserve_lambda = [](std::size_t begin, std::size_t end, std::size_t index) {};
140
141 // launch the threads executing the lambdas
142 this->_threadProcessDatabase_(reserve_lambda, undo_reserve_lambda);
143
144 // insert the translator into the translator set
145 const std::size_t pos = _translators_.insertTranslator(translator, input_column, unique_column);
146
147 // insert the name of the translator's variable to the set of variable names
// (if this fails, the translator just inserted is removed again)
148 try {
149 this->variable_names_.push_back(translator.variable()->name());
150 } catch (...) {
151 _translators_.eraseTranslator(pos);
152 throw;
153 }
154
155 // if the databaseTable is not empty, fill the column of the database
156 // corresponding to the translator with missing values
157 if (!IDatabaseTable< DBTranslatedValue >::empty()) {
158 const DBTranslatedValue missing = _translators_[pos].missingValue();
159
160 // create the lambda for adding a new column filled with a missing value
161 auto fill_lambda = [this, missing](std::size_t begin, std::size_t end, std::size_t index) {
162 std::size_t i = begin;
163 try {
164 for (; i < end; ++i) {
165 this->rows_[i].row().push_back(missing);
166 this->has_row_missing_val_[i] = IsMissing::True;
167 }
168 } catch (...) {
// roll back the rows [begin, i) already extended by this call: the row to
// shrink is rows_[j], row i is the one whose push_back failed
// NOTE(review): has_row_missing_val_ is not restored by either rollback --
// confirm this is acceptable
169 for (std::size_t j = begin; j < i; ++j)
170 this->rows_[j].row().pop_back();
171 throw;
172 }
173 };
174
// undo of a completed fill: remove the cell appended to each row of the range
175 auto undo_fill_lambda = [this](std::size_t begin, std::size_t end, std::size_t index) {
176 for (std::size_t i = begin; i < end; ++i)
177 this->rows_[i].row().pop_back();
178 };
179
180 // launch the threads executing the lambdas
181 this->_threadProcessDatabase_(fill_lambda, undo_fill_lambda);
182 }
183
184 return pos;
185 }
186
// Adds a new translator built for variable var, without any missing symbol.
//
// var           : variable handed to the translator set; its name becomes the
//                 name of the new column
// input_column  : column of the input dataset read by the translator; an
//                 error is reported if this column is marked as ignored
// unique_column : forwarded as is to the translator set
// returns the position returned by _translators_.insertTranslator
//
// An error is also reported when the table already contains rows (see the
// comment inside the function).
188 std::size_t DatabaseTable::insertTranslator(const Variable& var,
189 const std::size_t input_column,
190 const bool unique_column) {
191 // check that there is no ignored_column corresponding to column
192 if (_ignored_cols_.exists(input_column))
194 "Column " << input_column << " is marked as being ignored. "
195 << "So it is forbidden to create a translator for that column.")
196
197 // if the databaseTable is not empty, we should fill the column of the
198 // database corresponding to the new translator with missing values. But, the
199 // current method assumes that the list of missing values is empty. Hence, it
200 // should raise an exception
201 if (!IDatabaseTable< DBTranslatedValue >::empty()) {
203 "inserting a new translator into a database creates a new column "
204 << "with missing values. However, you did not define any symbol for "
205 << "such values.")
206 }
207
208 // reserve some place for the new column in the records of the database
209 const std::size_t new_size = this->nbVariables() + 1;
210
211 // create the lambda for reserving some memory for the new column
212 // and the one that undoes what it performed if some thread executing
213 // it raised an exception
214 auto reserve_lambda = [this, new_size](std::size_t begin, std::size_t end, std::size_t index) {
215 for (std::size_t i = begin; i < end; ++i)
216 this->rows_[i].row().reserve(new_size);
217 };
218
// reserving memory does not change the content of the rows: nothing to undo
219 auto undo_reserve_lambda = [](std::size_t begin, std::size_t end, std::size_t index) {};
220
221 // launch the threads executing the lambdas
222 this->_threadProcessDatabase_(reserve_lambda, undo_reserve_lambda);
223
224 // insert the translator into the translator set
225 const std::size_t pos = _translators_.insertTranslator(var, input_column, unique_column);
226
227 // insert the name of the translator's variable to the set of variable names
// (if this fails, the translator just inserted is removed again)
228 try {
229 this->variable_names_.push_back(var.name());
230 } catch (...) {
231 _translators_.eraseTranslator(pos);
232 throw;
233 }
234
235 return pos;
236 }
237
// Adds a new translator built for variable var with the given missing symbols.
//
// var             : variable handed to the translator set; its name becomes
//                   the name of the new column
// input_column    : column of the input dataset read by the translator; an
//                   error is reported if this column is marked as ignored
// missing_symbols : forwarded as is to the translator set
// unique_column   : forwarded as is to the translator set
// returns the position returned by _translators_.insertTranslator
//
// If the table already contains rows, each of them receives the translator's
// missing value in the new column and is flagged as having a missing value.
239 std::size_t DatabaseTable::insertTranslator(const Variable& var,
240 const std::size_t input_column,
241 const std::vector< std::string >& missing_symbols,
242 const bool unique_column) {
243 // check that there is no ignored_column corresponding to column
244 if (_ignored_cols_.exists(input_column))
246 "Column " << input_column << " is marked as being ignored. "
247 << "So it is forbidden to create a translator for that column.")
248
249 // reserve some place for the new column in the records of the database
250 const std::size_t new_size = this->nbVariables() + 1;
251
252 // create the lambda for reserving some memory for the new column
253 // and the one that undoes what it performed if some thread executing
254 // it raised an exception
255 auto reserve_lambda
256 = [this, new_size](std::size_t begin, std::size_t end, std::size_t index) -> void {
257 for (std::size_t i = begin; i < end; ++i)
258 this->rows_[i].row().reserve(new_size);
259 };
260
// reserving memory does not change the content of the rows: nothing to undo
261 auto undo_reserve_lambda = [](std::size_t begin, std::size_t end, std::size_t index) -> void {};
262
263 // launch the threads executing the lambdas
264 this->_threadProcessDatabase_(reserve_lambda, undo_reserve_lambda);
265
266 // insert the translator into the translator set
267 const std::size_t pos
268 = _translators_.insertTranslator(var, input_column, missing_symbols, unique_column);
269
270 // insert the name of the translator's variable to the set of variable names
// (if this fails, the translator just inserted is removed again)
271 try {
272 this->variable_names_.push_back(var.name());
273 } catch (...) {
274 _translators_.eraseTranslator(pos);
275 throw;
276 }
277
278 // if the databaseTable is not empty, fill the column of the database
279 // corresponding to the translator with missing values
280 if (!IDatabaseTable< DBTranslatedValue >::empty()) {
281 const DBTranslatedValue missing = _translators_[pos].missingValue();
282
283 // create the lambda for adding a new column filled with a missing value
284 auto fill_lambda
285 = [this, missing](std::size_t begin, std::size_t end, std::size_t index) -> void {
286 std::size_t i = begin;
287 try {
288 for (; i < end; ++i) {
289 this->rows_[i].row().push_back(missing);
290 this->has_row_missing_val_[i] = IsMissing::True;
291 }
292 } catch (...) {
// roll back the rows [begin, i) already extended by this call: the row to
// shrink is rows_[j], row i is the one whose push_back failed
// NOTE(review): has_row_missing_val_ is not restored by either rollback --
// confirm this is acceptable
293 for (std::size_t j = begin; j < i; ++j)
294 this->rows_[j].row().pop_back();
295 throw;
296 }
297 };
298
// undo of a completed fill: remove the cell appended to each row of the range
299 auto undo_fill_lambda
300 = [this](std::size_t begin, std::size_t end, std::size_t index) -> void {
301 for (std::size_t i = begin; i < end; ++i)
302 this->rows_[i].row().pop_back();
303 };
304
305 // launch the threads executing the lambdas
306 this->_threadProcessDatabase_(fill_lambda, undo_fill_lambda);
307 }
308
309 return pos;
310 }
311
312 // erases the kth translator or all those parsing the kth column of
313 // the input dataset
// For every index kk returned by _getKthIndices_(k, k_is_input_col), the
// variable name, the cell of each row and the translator at index kk are
// removed.
// NOTE(review): each erasure shifts the indices located after kk; this
// presumably relies on _getKthIndices_ returning its indices in decreasing
// order -- confirm
314 void DatabaseTable::eraseTranslators(const std::size_t k, const bool k_is_input_col) {
315 for (const auto kk: _getKthIndices_(k, k_is_input_col)) {
316 // erase the translator of index kk and the corresponding variable
317 // name. If there remains no more translator in the translator set,
318 // rows_ should become empty
319 this->variable_names_.erase(this->variable_names_.begin() + kk);
320 if (this->variable_names_.empty()) {
321 IDatabaseTable< DBTranslatedValue >::eraseAllRows();
322 } else {
// number of translators before the erasure (the translator of index kk
// is only removed from _translators_ at the end of the iteration)
323 const std::size_t nb_trans = _translators_.size();
324
// removes cell kk of each row; when that cell held a missing value and no
// other cell of the row does, the row is flagged as having no missing value
325 auto erase_lambda
326 = [this, nb_trans, kk](std::size_t begin, std::size_t end, std::size_t index) -> void {
327 for (std::size_t i = begin; i < end; ++i) {
328 auto& row = this->rows_[i].row();
329 if (this->_translators_.isMissingValue(row[kk], kk)) {
330 bool has_missing_val = false;
331 for (std::size_t j = std::size_t(0); j < nb_trans; ++j) {
332 if ((j != kk) && this->_translators_.isMissingValue(row[j], j)) {
333 has_missing_val = true;
334 break;
335 }
336 }
337 if (!has_missing_val) this->has_row_missing_val_[i] = IsMissing::False;
338 }
339 row.erase(row.begin() + kk);
340 }
341 };
342
// no rollback is performed for the erasure
343 auto undo_erase_lambda
344 = [](std::size_t begin, std::size_t end, std::size_t index) -> void {};
345
346 // launch the threads executing the lambdas
347 this->_threadProcessDatabase_(erase_lambda, undo_erase_lambda);
348 }
349 _translators_.eraseTranslator(kk);
350 }
351 }
352
// Substitutes a translator of the table by new_translator and re-encodes the
// corresponding column of the rows already stored.
//
// new_translator : the translator to use from now on
// k, k_is_input_col : identify the translator to replace; the index used is
//                  the last one returned by _getKthIndices_(k, k_is_input_col)
//
// Reports an OutOfBounds error when no such translator is found. When the
// table contains rows, errors are also reported if the current translator is
// lossy or if the stored values cannot be re-encoded by new_translator.
354 void DatabaseTable::changeTranslator(DBTranslator& new_translator,
355 const std::size_t k,
356 const bool k_is_input_col) {
357 // get the index of the column in the database. If it is not found, indicate that
358 // the substitution is impossible
359 const auto db_k = _getKthIndices_(k, k_is_input_col);
360 if (db_k.empty()) {
361 GUM_ERROR(OutOfBounds, "the translator at position " << k << " cannot be found.");
362 }
363 const std::size_t kk = db_k[db_k.size() - 1];
364 if (kk >= _translators_.size()) {
365 GUM_ERROR(OutOfBounds, "the translator at position " << k << " cannot be found.");
366 }
367
368
369 // if the dataset does not contain any data, we can safely substitute the old translator
370 // by the new one
371 if (this->empty()) {
372 // keep into account the name of the new translator
373 this->variable_names_[kk] = new_translator.variable()->name();
374
375 // substitute in the translator set the old translator by the new one
376 _translators_.changeTranslator(new_translator, kk);
377
378 return;
379 }
380
381 // get the translator and check that it is not lossy: as, here, there are some data,
382 // we cannot always ensure that there won't be some loss of information substituting
383 // one translator by another
384 DBTranslator& old_translator = _translators_[kk];
385 if (!old_translator.isLossless()) {
386 // for the moment, we consider that it is impossible to substitute lossy translators
387 // because we may have already lost information that are necessary for the new
388 // translator
390 "Lossy translators cannot yet be substituted by other translators");
391 }
392
// the per-thread result vectors below are sized with this number and are
// indexed by the third argument (index) passed to the lambdas
393 const std::size_t nb_threads = this->nbProcessingThreads_();
394
395 // how missing values will be translated
// (first: missing value of the old translator; second: value written in its
// place, by default the missing value of the new translator)
396 std::pair< DBTranslatedValue, DBTranslatedValue > miss_mapping(old_translator.missingValue(),
397 new_translator.missingValue());
398
399 // Now, we should compute the mapping from the values and missing symbols of the old
400 // translator to those of the new one.
401
402 // When the database already contains some data, we must ensure that we will be able to
403 // substitute the old translator by the new one without losing any information. Possible
404 // loss of information may occur in the following cases:
405 // 1/ if the set of missing symbols of the old translator is not a singleton and some of its
406 // missing symbols do not belong to the set of missing symbols of the new translator.
407 // In this case, the translation of this symbol by the new translator should either raise
408 // an exception because the new translator does not know how to handle it, or should
409 // produce a DBTranslatedValue if the new translator thinks this is an observed value.
410 // Now, the problem is that when observing a missing symbol in the database, we have no
411 // way to determine to which above case this should correspond. Hence the substitution
412 // cannot be made unambiguously.
413 // 2/ if the set of (non-missing) values of the old translator is not included in the one
414 // of the new translator
415 // If one of these cases occur, before performing the translation, we must parse the content
416 // of the database: if case 1/ obtains and if the database contains some missing symbols,
417 // then we cannot unambiguously substitute the old translator by the new one, hence an error.
418 // If case 2/ obtains, we must check that all the observed values currently stored into the
419 // database also belong to the set of values the new translator is capable of translating.
// NOTE(review): the empty case already returned above, so this test is
// always true at this point
420 if (!this->empty()) {
421 // to test case 1, we first determine whether the dataset contains some
422 // missing values
423 bool has_missing_value = false;
424 {
425 std::vector< int > missing_values(nb_threads, 0);
426
427 // a lambda to parse all the translated values for missing symbols
// (it stops at the first missing value found in its range of rows)
428 auto get_lambda = [this, kk, &missing_values](std::size_t begin,
429 std::size_t end,
430 std::size_t index) -> void {
431 for (std::size_t i = begin; i < end; ++i) {
432 auto& row = this->rows_[i].row();
433 if (this->_translators_.isMissingValue(row[kk], kk)) {
434 missing_values[index] = 1;
435 return;
436 }
437 }
438 };
439
// read-only scan: nothing to undo
440 auto undo_get_lambda = [](std::size_t begin, std::size_t end, std::size_t index) -> void {};
441
442 // launch the threads executing the lambdas
443 this->_threadProcessDatabase_(get_lambda, undo_get_lambda);
444
445 // if missing_values has at least one value 1, there are missing values
446 for (const auto x: missing_values) {
447 if (x) {
448 has_missing_value = true;
449 break;
450 }
451 }
452 }
453
454 // test for case 1/
455 const auto old_missing_symbols = old_translator.missingSymbols();
456 const auto new_missing_symbols = new_translator.missingSymbols();
457 const bool multiple_missing_symbols = old_missing_symbols.size() > 1;
458 const bool old_miss_included = old_missing_symbols.isSubsetOrEqual(new_missing_symbols);
459 if (has_missing_value && multiple_missing_symbols && !old_miss_included) {
460 // here, we know that the database contains missing values
461 // and we cannot unambiguously perform the translator's substitution
463 "it is impossible to substitute the translator because "
464 "the database contains some missing values that cannot be "
465 "substituted unambiguously");
466 }
467
468 // if the database contains some missing values, two cases can obtain:
469 // a/ old_miss_included is true, in which case all the old missing values
470 // will be translated as missing values in the new translator.
471 // In this case, there is no translation problem.
472 // b/ old_miss_included is false. In this case, we know that there is only
473 // one old missing symbol, which is not included in the set of missing
474 // symbols of the new translator. If we can translate its symbol as a
475 // "proper" value in the new translator, that's ok, otherwise we cannot
476 // perform the substitution.
477 if (has_missing_value && !old_miss_included) {
478 try {
479 new_translator.translate(*(old_translator.missingSymbols().begin()));
480 } catch (Exception const&) {
482 "it is impossible to substitute the translator because "
483 "the database contains some missing values that cannot be "
484 "substituted");
485 }
486 }
487
488 // compute the mapping of the missing symbol if this one does not correspond
489 // to a missing value in the new translator
// (same translation as the one attempted just above, performed a second time)
490 if (has_missing_value && !old_miss_included) {
491 miss_mapping.second = new_translator.translate(*(old_translator.missingSymbols().begin()));
492 }
493
494 // test for case 2/ (if the set of (non-missing) values of the old translator is
495 // not included in the one of the new translator)
496
497 // now, parse the database and check that all the values contained in the
498 // database can be translated
499 std::vector< int > unmapped(nb_threads, 0);
500
501 // a lambda to parse all the translated values
// NOTE(review): missing cells are detected here by comparing discr_val,
// whereas the scan above uses _translators_.isMissingValue -- confirm that
// both tests agree for every kind of translator
// NOTE(review): new_translator.translate is called from the processing
// lambdas -- confirm it may be called concurrently
502 auto check_lambda
503 = [this, kk, &old_translator, &new_translator, &unmapped](std::size_t begin,
504 std::size_t end,
505 std::size_t index) -> void {
506 const auto old_miss = old_translator.missingValue().discr_val;
507 for (std::size_t i = begin; i < end; ++i) {
508 const auto& row = this->rows_[i].row();
509 if (row[kk].discr_val != old_miss) {
510 try {
511 new_translator.translate(old_translator.translateBack(row[kk]));
512 } catch (Exception const&) {
513 // ok, here, the translation is impossible
514 unmapped[index] = 1;
515 return;
516 }
517 }
518 }
519 };
520
// read-only check: nothing to undo
521 auto undo_check_lambda = [](std::size_t begin, std::size_t end, std::size_t index) -> void {};
522
523 // launch the threads executing the lambdas
524 this->_threadProcessDatabase_(check_lambda, undo_check_lambda);
525
526 // if unmapped has at least one value 1, there are values that we don't know how to
527 // translate
528 for (const auto x: unmapped) {
529 if (x) {
531 "The database contains some values that cannot be translated "
532 "using the new translator");
533 }
534 }
535 }
536
537 // here, we know that we can perform the translator's substitution, so
538 // let's do it
// (cells equal to the old missing value receive miss_mapping.second, the
// other ones are translated back by the old translator, then translated by
// the new one)
539 auto change_lambda
540 = [this, kk, &old_translator, &new_translator, miss_mapping](std::size_t begin,
541 std::size_t end,
542 std::size_t index) -> void {
543 const auto old_miss = old_translator.missingValue().discr_val;
544 for (std::size_t i = begin; i < end; ++i) {
545 auto& row = this->rows_[i].row();
546 if (row[kk].discr_val == old_miss) {
547 row[kk] = miss_mapping.second;
548 } else {
549 row[kk] = new_translator.translate(old_translator.translateBack(row[kk]));
550 }
551 }
552 };
553
// no rollback is performed for the substitution
554 auto undo_change_lambda = [](std::size_t begin, std::size_t end, std::size_t index) -> void {};
555
556 // launch the threads executing the lambdas
557 this->_threadProcessDatabase_(change_lambda, undo_change_lambda);
558
559 // keep into account the name of the new translator
560 this->variable_names_[kk] = new_translator.variable()->name();
561
562 // substitute in the translator set the old translator by the new one
563 _translators_.changeTranslator(new_translator, kk);
564 }
565
// Substitutes a translator of the table by a translator created for var.
//
// var                 : variable for which DBTranslators::create builds the
//                       new translator
// k, k_is_input_col   : identify the translator to replace
// missing_symbols     : missing symbols of the new translator; when empty,
//                       those of the translator currently in place are reused
// editable_dictionary : forwarded as is to DBTranslators::create
// max_dico_entries    : forwarded as is to DBTranslators::create
//
// Reports an OutOfBounds error when missing_symbols is empty and the
// translator to replace cannot be found; errors raised by the substitution
// itself are propagated after the temporary translator has been deleted.
567 void DatabaseTable::changeTranslator(const Variable& var,
568 const std::size_t k,
569 const bool k_is_input_col,
570 const std::vector< std::string >& missing_symbols,
571 const bool editable_dictionary,
572 const std::size_t max_dico_entries) {
573 // get the appropriate set of missing symbols
574 std::vector< std::string > missing;
575 if (missing_symbols.empty()) {
576 // try to get the missing symbols of the current translator
577 const auto db_k = _getKthIndices_(k, k_is_input_col);
578 if (db_k.empty()) {
579 GUM_ERROR(OutOfBounds, "the translator at position " << k << " cannot be found.");
580 }
581 const std::size_t kk = db_k[db_k.size() - 1];
582 if (kk >= _translators_.size()) {
583 GUM_ERROR(OutOfBounds, "the translator at position " << k << " cannot be found.");
584 }
585
586 const auto& miss = _translators_[kk].missingSymbols();
587 missing.reserve(miss.size());
// copy the symbols of the current translator: the source of the loop is
// miss, missing being the (still empty) vector that is filled
588 for (const auto& m: miss) {
589 missing.push_back(m);
590 }
591 } else {
592 missing = missing_symbols;
593 }
594
595 // create the DBTranslator corresponding to the parameters
596 DBTranslator* new_translator
597 = DBTranslators::create(var, missing, editable_dictionary, max_dico_entries);
598
599 try {
600 changeTranslator(*new_translator, k, k_is_input_col);
601 } catch (...) {
602 // remove from memory new_translator
603 delete new_translator;
604 throw;
605 }
606
607 // remove from memory new_translator
608 delete new_translator;
609 }
610
// Proposes, for the translators whose variable could be described by a more
// specific type, a replacement translator.
//
// Returns a vector of pairs (index of the translator in _translators_, newly
// allocated translator held by a shared_ptr). Translators for which nothing
// better is found do not appear in the result. The table is not modified.
612 std::vector< std::pair< Idx, std::shared_ptr< DBTranslator > > >
613 DatabaseTable::betterTranslators() const {
614 std::vector< std::pair< Idx, std::shared_ptr< DBTranslator > > > better;
615
616 for (Idx i = 0, size = _translators_.size(); i < size; ++i) {
617 switch (_translators_[i].variable()->varType()) {
618 // if the translator is discretized, range or continuous, we cannot
619 // improve it
620 case VarType::CONTINUOUS :
621 case VarType::NUMERICAL :
622 case VarType::DISCRETIZED :
623 case VarType::RANGE : break;
624
625 // if the translator can only translate integers and all the numbers
626 // are consecutive, prefer a range variable
627 case VarType::INTEGER : {
628 const auto& var = dynamic_cast< const IntegerVariable& >(*(_translators_[i].variable()));
629
630 // check that the values in the domain are consecutive
// (the domain is scanned in its stored order, each element having to be
// equal to the previous one plus 1)
631 const auto& domain = var.integerDomain();
632 if (domain.empty()) break; // we cannot get a better translator
633 int prev = domain[0] - 1;
634 bool ok = true;
635 for (const auto elt: domain) {
636 if (elt != prev + 1) {
637 ok = false;
638 break;
639 } else {
640 prev = elt;
641 }
642 }
643
644 // here, we know that the values are consecutive, hence we can
645 // change the variable to a range variable
646 if (ok) {
647 RangeVariable new_var(var.name(),
648 var.description(),
649 domain[0],
650 domain[domain.size() - 1]);
651
652 // get the set of missing symbols
// (copied into a vector of strings, the type passed to the new translator)
653 const auto& missing = _translators_[i].missingSymbols();
654 std::vector< std::string > new_missing;
655 new_missing.reserve(missing.size());
656 for (const auto& miss: missing) {
657 new_missing.push_back(miss);
658 }
659 auto new_trans = new DBTranslator4RangeVariable(new_var, new_missing);
660 better.push_back(std::pair< Idx, std::shared_ptr< DBTranslator > >(
661 i,
662 std::shared_ptr< DBTranslator >(new_trans)));
663 }
664 break;
665 }
666
667 // if the translator is a set of labels, check whether those are all
668 // numbers. In this case, if they are integers and consecutive,
669 // prefer a RangeVariable; if they are integers but not consecutive,
670 // prefer an IntegerVariable, else check whether a continuous
671 // variable could be ok
672 case VarType::LABELIZED : {
673 const auto& var
674 = dynamic_cast< const LabelizedVariable& >(*(_translators_[i].variable()));
675 if (!var.domainSize()) break; // we cannot get a better translator
676
677 // get the numerical values of the labels
// (labels accepted by DBCell::isInteger are parsed with std::stoi, the other
// labels accepted by DBCell::isReal with std::stof; any other label stops
// the analysis of this variable)
// NOTE(review): std::stoi / std::stof throw std::out_of_range on values that
// do not fit -- confirm labels cannot be that large
678 Set< int > int_values;
679 Set< float > real_values;
680 bool ok = true;
681 for (Idx j = 0, s = var.domainSize(); j < s; ++j) {
682 const auto& val = var.label(j);
683 if (DBCell::isReal(val)) {
684 if (DBCell::isInteger(val)) {
685 int_values.insert(std::stoi(val));
686 } else {
687 real_values.insert(std::stof(val));
688 }
689 } else {
690 ok = false;
691 break;
692 }
693 }
694
695 // if there are only numerical values, we can certainly do something
696 if (ok) {
697 // check whether an IntegerVariable or a Range variable would do
698 if (real_values.empty()) {
699 // get the values in increasing order
700 std::vector< int > values;
701 values.reserve(int_values.size());
702 for (const auto val: int_values)
703 values.push_back(val);
704 std::sort(values.begin(), values.end());
705
706 // if all the values are consecutive, then a range variable
707 // would be best
// (values is not empty here: the domain has at least one label and every
// label was parsed as an integer)
708 int prev = values[0] - 1;
709 bool consecutive = true;
710 for (const auto elt: values) {
711 if (elt != prev + 1) {
712 consecutive = false;
713 break;
714 } else {
715 prev = elt;
716 }
717 }
718
719 if (consecutive) {
720 // here, we should create a range variable since all the
721 // values in the domain are consecutive
722 RangeVariable new_var(var.name(), var.description(), values[0], values.back());
723
724 // get the set of missing symbols
725 const auto& missing = _translators_[i].missingSymbols();
726 std::vector< std::string > new_missing;
727 new_missing.reserve(missing.size());
728 for (const auto& miss: missing) {
729 new_missing.push_back(miss);
730 }
731 auto new_trans = new DBTranslator4RangeVariable(new_var, new_missing);
732 better.emplace_back(i, std::shared_ptr< DBTranslator >(new_trans));
733 } else {
734 // here, the values in the domain are not consecutive, hence
735 // it would be better to create an IntegerVariable
736 IntegerVariable new_var(var.name(), var.description(), values);
737
738 // get the set of missing symbols
739 const auto& missing = _translators_[i].missingSymbols();
740 std::vector< std::string > new_missing;
741 new_missing.reserve(missing.size());
742 for (const auto& miss: missing) {
743 new_missing.push_back(miss);
744 }
745 auto new_trans = new DBTranslator4IntegerVariable(new_var, new_missing);
746 better.emplace_back(i, std::shared_ptr< DBTranslator >(new_trans));
747 }
748 } else {
749 // here, a translator for continuous variable would be suited
// (at least one label is a real number that is not an integer)
750 ContinuousVariable new_var(var.name(), var.description());
751
752 // get the set of missing symbols
753 const auto& missing = _translators_[i].missingSymbols();
754 std::vector< std::string > new_missing;
755 new_missing.reserve(missing.size());
756 for (const auto& miss: missing) {
757 new_missing.push_back(miss);
758 }
759 auto new_trans = new DBTranslator4ContinuousVariable(new_var, new_missing);
760 better.emplace_back(i, std::shared_ptr< DBTranslator >(new_trans));
761 }
762 }
763 break;
764 }
765 }
766 }
767
768 return better;
769 }
770
// Returns the translator designated by k.
//
// k, k_is_input_col : converted into a translator index by _getKthIndex_
//
// When the resulting index is not smaller than the number of translators, an
// error is reported, its message depending on k_is_input_col.
772 const DBTranslator& DatabaseTable::translator(const std::size_t k,
773 const bool k_is_input_col) const {
774 // find the position of the translator that we look for. This
775 // is variable kk below
776 const std::size_t nb_trans = _translators_.size();
777 const std::size_t kk = _getKthIndex_(k, k_is_input_col);
778
779 // check if the translator exists
780 if (nb_trans <= kk) {
781 if (k_is_input_col) {
783 "there is no translator in the database table that " << "parses Column " << k)
784 } else {
786 "the database has " << nb_trans << " translators, so Translator #" << k
787 << " does not exist")
788 }
789 }
790
791 return _translators_.translator(kk);
792 }
793
// Returns the variable of the translator designated by k.
//
// k, k_is_input_col : converted into a translator index by _getKthIndex_
//
// When the resulting index is not smaller than the number of translators, an
// error is reported, its message depending on k_is_input_col.
795 const Variable& DatabaseTable::variable(const std::size_t k, const bool k_is_input_col) const {
796 // find the position of the translator that contains the variable.
797 // This is variable kk below
798 const std::size_t nb_trans = _translators_.size();
799 const std::size_t kk = _getKthIndex_(k, k_is_input_col);
800
801 // check if the translator exists
802 if (nb_trans <= kk) {
803 if (k_is_input_col) {
805 "there is no variable in the database table that " << "corresponds to Column "
806 << k)
807 } else {
809 "the database has " << nb_trans << " variables, so Variable #" << k
810 << " does not exist")
811 }
812 }
813
814 return _translators_.variable(kk);
815 }
816
// Renames the variables of the translators, then rebuilds variable_names_
// from the translators' variables.
//
// names                : the new names
// from_external_object : when false, names must contain exactly one name per
//                        translator, the ith name being given to the ith
//                        translator; when true, names is indexed by input
//                        column, so each translator receives the name located
//                        at the index of the input column it reads
//
// An error is reported when the size of names does not fit the case selected
// by from_external_object.
818 void DatabaseTable::setVariableNames(const std::vector< std::string >& names,
819 const bool from_external_object) {
820 const std::size_t size = names.size();
821 const std::size_t nb_trans = _translators_.size();
822 if (!from_external_object) {
823 if (nb_trans != size) {
825 "the number of variable's names (i.e., "
826 << size << ") does not correspond to the number of columns of the "
827 << "database table (i.e., " << nb_trans << ")")
828 }
829
830 // update the translator names
831 for (std::size_t i = std::size_t(0); i < size; ++i) {
832 _translators_.translator(i).setVariableName(names[i]);
833 }
834 } else {
// names must be long enough to be indexed by the highest input column
835 if (nb_trans && (_translators_.highestInputColumn() >= size)) {
837 "the names vector has " << size << " elements whereas it should have at least "
838 << (_translators_.highestInputColumn() + 1)
839 << " elements so that each translator is assigned a name")
840 }
841
842 // update the translator names
843 for (std::size_t i = std::size_t(0); i < nb_trans; ++i) {
844 _translators_.translator(i).setVariableName(names[_translators_.inputColumn(i)]);
845 }
846 }
847
848 // update variable_names_ using the newly assigned translators names
849 this->variable_names_.resize(nb_trans);
850 for (std::size_t i = std::size_t(0); i < nb_trans; ++i) {
851 this->variable_names_[i] = _translators_.variable(i).name();
852 }
853 }
854
// Marks a column of the input dataset as ignored and removes the translators
// designated by k.
//
// k              : either an input column or a translator index
// k_is_input_col : when true, k is an input column, which is added to
//                  _ignored_cols_ (nothing is done if it is already there);
//                  when false, k is a translator index and the input column
//                  read by this translator is added to _ignored_cols_
857 void DatabaseTable::ignoreColumn(const std::size_t k, const bool k_is_input_col) {
858 // indicate that the column will be forbidden. If the column is already
859 // forbidden, do nothing. But if the column is assigned to a translator
860 // that does not exist, raise an UndefinedElement exception
861 const std::size_t nb_trans = _translators_.size();
862 if (k_is_input_col) {
863 if (_ignored_cols_.exists(k)) return;
864 _ignored_cols_.insert(k);
865 } else {
866 if (k < nb_trans) {
867 _ignored_cols_.insert(_translators_.inputColumn(k));
868 } else {
870 "It is impossible to ignore the column parsed by Translator #"
871 << k << " because there exist only " << nb_trans << " translators")
872 }
873 }
874
875 // remove all the translators corresponding to k
876 eraseTranslators(k, k_is_input_col);
877 }
878
880 const typename DatabaseTable::template DBVector< std::size_t >
881 DatabaseTable::ignoredColumns() const {
882 const std::size_t nb_trans = _translators_.size();
883
884 if (nb_trans == std::size_t(0)) { return DBVector< std::size_t >{std::size_t(0)}; }
885
886 // get the columns handled by the translators, sorted by increasing order
887 DBVector< std::size_t > cols(nb_trans);
888 for (std::size_t i = std::size_t(0); i < nb_trans; ++i)
889 cols[i] = _translators_.inputColumn(i);
890 std::sort(cols.begin(), cols.end());
891
892 // create a vector with all the possible input columns
893 const std::size_t highest = _translators_.highestInputColumn() + 1;
894 DBVector< std::size_t > ignored_cols(highest);
895 std::iota(ignored_cols.begin(), ignored_cols.end(), 0);
896
897 // remove from ignored_cols the elements of cols
898 for (std::size_t i = std::size_t(0), ii = highest - 1, k = std::size_t(0), kk = nb_trans - 1;
899 i < highest;
900 ++i, --ii) {
901 if (cols[kk] == ii) {
902 ignored_cols.erase(ignored_cols.begin() + ii);
903 while ((k < nb_trans) && (cols[kk] == ii)) {
904 --kk;
905 ++k;
906 }
907 if (k == nb_trans) break;
908 }
909 }
910
911 // add the column past the last translator
912 ignored_cols.push_back(highest);
913
914 return ignored_cols;
915 }
916
918 const typename DatabaseTable::template DBVector< std::size_t >
919 DatabaseTable::inputColumns() const {
920 const std::size_t nb_trans = _translators_.size();
921 if (nb_trans == std::size_t(0)) { return DBVector< std::size_t >(); }
922
923 DBVector< std::size_t > input_cols(nb_trans);
924 for (std::size_t i = std::size_t(0); i < nb_trans; ++i) {
925 input_cols[i] = _translators_.inputColumn(i);
926 }
927 return input_cols;
928 }
929
931 std::size_t DatabaseTable::domainSize(const std::size_t k, const bool k_is_input_col) const {
932 // find the position kk of the translator that contains the variable
933 const std::size_t nb_trans = _translators_.size();
934 const std::size_t kk = _getKthIndex_(k, k_is_input_col);
935
936 // check if the translator exists
937 if (nb_trans <= kk) {
938 if (k_is_input_col) {
940 "there is no variable in the database table that " << "corresponds to Column "
941 << k)
942 } else {
944 "the database has " << nb_trans << " variables, so Variable #" << k
945 << " does not exist")
946 }
947 }
948
949 return _translators_.domainSize(kk);
950 }
951
952 // indicates whether a reordering is needed to make the kth
953 // translator sorted by lexicographical order
954 bool DatabaseTable::needsReordering(const std::size_t k, const bool k_is_input_col) const {
955 // find the position kk of the translator that contains the variable
956 const std::size_t nb_trans = _translators_.size();
957 const std::size_t kk = _getKthIndex_(k, k_is_input_col);
958
959 // check if the translator exists
960 if (nb_trans <= kk) {
961 if (k_is_input_col) {
963 "there is no translator in the database table that " << "parses Column " << k)
964 } else {
966 "the database has " << nb_trans << " translators, so Translator #" << k
967 << " does not exist")
968 }
969 }
970
971 return _translators_.needsReordering(kk);
972 }
973
974 // performs a reordering of the kth translator or of the first
975 // translator corresponding to the kth column of the input database
976 void DatabaseTable::reorder(const std::size_t k, const bool k_is_input_col) {
977 // find the position kk of the translator that contains the variable
978 const std::size_t nb_trans = _translators_.size();
979 const std::size_t kk = _getKthIndex_(k, k_is_input_col);
980
981 // check if the translator exists
982 if (nb_trans <= kk) {
983 if (k_is_input_col) {
985 "there is no translator in the database table that " << "parses Column " << k)
986 } else {
988 "the database has " << nb_trans << " translators, so Translator #" << k
989 << " does not exist")
990 }
991 }
992
993 // if the translator is not designed for a discrete variable, there
994 // is no reordering to apply
995 if (_translators_.translator(kk).getValType() != DBTranslatedValueType::DISCRETE) return;
996
997 // get the translation to perform
998 auto updates = _translators_.reorder(kk);
999 if (updates.empty()) return;
1000
1001 std::size_t size = updates.size();
1002 std::vector< std::size_t > new_values(size);
1003 for (const auto& update: updates) {
1004 if (update.first >= size) {
1005 size = update.first + 1;
1006 new_values.resize(size);
1007 }
1008 new_values[update.first] = update.second;
1009 }
1010
1011 // apply the translations
1012 auto newtrans_lambda
1013 = [this, kk, &new_values](std::size_t begin, std::size_t end, std::size_t index) -> void {
1014 for (std::size_t i = begin; i < end; ++i) {
1015 auto& elt = this->rows_[i][kk].discr_val;
1016 if (elt != std::numeric_limits< std::size_t >::max()) elt = new_values[elt];
1017 }
1018 };
1019
1020 auto undo_newtrans_lambda
1021 = [](std::size_t begin, std::size_t end, std::size_t index) -> void {};
1022
1023 // launch the threads executing the lambdas
1024 this->_threadProcessDatabase_(newtrans_lambda, undo_newtrans_lambda);
1025 }
1026
1028 void DatabaseTable::insertRow(const std::vector< std::string >& new_row) {
1029 // check that the row can be fully translated, i.e., it contains enough
1030 // columns to be translated
1031 const std::size_t row_size = new_row.size();
1032 if (row_size == std::size_t(0)) return;
1033
1034 if (_translators_.highestInputColumn() >= row_size) {
1036 "the row #" << 1 + size() << " has " << row_size
1037 << " columns whereas the database requires at least "
1038 << (_translators_.highestInputColumn() + 1) << " columns")
1039 }
1040
1041 // convert the new_row into a row of DBTranslatedValue
1042 const std::size_t nb_trans = _translators_.size();
1043 Row< DBTranslatedValue > dbrow;
1044 dbrow.reserve(nb_trans);
1045 bool has_missing_val = false;
1046 for (std::size_t i = std::size_t(0); i < nb_trans; ++i) {
1047 const DBTranslatedValue new_val(_translators_.translate(new_row, i));
1048 if (_translators_.isMissingValue(new_val, i)) has_missing_val = true;
1049 dbrow.pushBack(std::move(new_val));
1050 }
1051
1052 this->insertRow(std::move(dbrow), has_missing_val ? IsMissing::True : IsMissing::False);
1053 }
1054
1057 bool DatabaseTable::_isRowCompatible_(
1058 const typename DatabaseTable::template Row< DBTranslatedValue >& row) const {
1059 // check that the size of the row corresponds to that of the translators
1060 const std::size_t row_size = row.size();
1061 if (row_size != _translators_.size()) return false;
1062
1063 const auto& translators = _translators_.translators();
1064 for (std::size_t i = std::size_t(0); i < row_size; ++i) {
1065 switch (translators[i]->getValType()) {
1066 case DBTranslatedValueType::DISCRETE :
1067 if ((row[i].discr_val >= translators[i]->domainSize())
1068 && (row[i].discr_val != std::numeric_limits< std::size_t >::max()))
1069 return false;
1070 break;
1071
1072 case DBTranslatedValueType::CONTINUOUS : {
1073 const IContinuousVariable& var
1074 = static_cast< const IContinuousVariable& >(*(translators[i]->variable()));
1075 if (((var.lowerBoundAsDouble() > (double)row[i].cont_val)
1076 || (var.upperBoundAsDouble() < (double)row[i].cont_val))
1077 && (row[i].cont_val != std::numeric_limits< float >::max()))
1078 return false;
1079 break;
1080 }
1081
1082 default : GUM_ERROR(NotImplementedYet, "Translated value type not supported yet")
1083 }
1084 }
1085
1086 return true;
1087 }
1088
1089 // insert a new DBRow of DBCells at the end of the database
1090 void DatabaseTable::insertRow(const typename DatabaseTable::template Row< DBCell >& new_row) {
1091 GUM_ERROR(NotImplementedYet, "not implemented yet")
1092 }
1093
1094 // insert a new DBRow of DBCells at the end of the database
1095 void DatabaseTable::insertRow(typename DatabaseTable::template Row< DBCell >&& new_row) {
1096 GUM_ERROR(NotImplementedYet, "not implemented yet")
1097 }
1098
1100 void DatabaseTable::insertRows(
1101 typename DatabaseTable::template Matrix< DBTranslatedValue >&& rows,
1102 const typename DatabaseTable::template DBVector< IsMissing >& rows_have_missing_vals) {
1103 // check that the new rows values are compatible with the values of
1104 // the variables stored within the translators
1105 for (const auto& new_row: rows) {
1106 if (!_isRowCompatible_(new_row)) {
1107 if (new_row.size() != _translators_.size()) {
1109 "The new row has " << new_row.size()
1110 << " elements whereas the database table has "
1111 << _translators_.size() << " columns")
1112 } else {
1113 GUM_ERROR(InvalidArgument, "the new row is not compatible with the current translators")
1114 }
1115 }
1116 }
1117
1118 IDatabaseTable< DBTranslatedValue >::insertRows(std::move(rows), rows_have_missing_vals);
1119 }
1120
1122 void DatabaseTable::insertRows(
1123 const typename DatabaseTable::template Matrix< DBTranslatedValue >& new_rows,
1124 const typename DatabaseTable::template DBVector< IsMissing >& rows_have_missing_vals) {
1125 // check that the new rows values are compatible with the values of
1126 // the variables stored within the translators
1127 for (const auto& new_row: new_rows) {
1128 if (!_isRowCompatible_(new_row)) {
1129 if (new_row.size() != _translators_.size()) {
1131 "The new row has " << new_row.size()
1132 << " elements whereas the database table has "
1133 << _translators_.size() << " columns")
1134 } else {
1135 GUM_ERROR(InvalidArgument, "the new row is not compatible with the current translators")
1136 }
1137 }
1138 }
1139
1140 IDatabaseTable< DBTranslatedValue >::insertRows(new_rows, rows_have_missing_vals);
1141 }
1142
1144 void DatabaseTable::insertRows(typename DatabaseTable::template Matrix< DBCell >&& new_rows) {
1145 GUM_ERROR(NotImplementedYet, "not implemented yet")
1146 }
1147
1149 void
1150 DatabaseTable::insertRows(const typename DatabaseTable::template Matrix< DBCell >& new_rows) {
1151 GUM_ERROR(NotImplementedYet, "not implemented yet")
1152 }
1153
1155 void DatabaseTable::clear() {
1156 _translators_.clear();
1157 _ignored_cols_.clear();
1158 IDatabaseTable< DBTranslatedValue >::clear();
1159 }
1160
1161} // namespace gum::learning
1162
1163#endif /* DOXYGEN_SHOULD_SKIP_THIS */
Exception: at least one argument passed to a function is not what was expected.
Error: The database contains some missing values.
Exception : there is something wrong with an implementation.
Exception : operation not allowed.
Exception : out of bound.
Exception : problem with size.
Exception : a looked-for element could not be found.
the class for packing together the translators used to preprocess the datasets
DatabaseTable(const MissingValType &missing_symbols, const DBTranslatorSet &translators=DBTranslatorSet())
default constructor
std::vector< std::string > MissingValType
The common class for the tabular database tables.
The class representing a tabular database stored in RAM.
#define GUM_ERROR(type, msg)
Definition exceptions.h:72
include the inlined functions if necessary
Definition CSVParser.h:54
STL namespace.
The union class for storing the translated values in learning databases.