aGrUM 2.3.2
a C++ library for (probabilistic) graphical models
IDatabaseTable.h
Go to the documentation of this file.
1/****************************************************************************
2 * This file is part of the aGrUM/pyAgrum library. *
3 * *
4 * Copyright (c) 2005-2025 by *
5 * - Pierre-Henri WUILLEMIN(_at_LIP6) *
6 * - Christophe GONZALES(_at_AMU) *
7 * *
8 * The aGrUM/pyAgrum library is free software; you can redistribute it *
9 * and/or modify it under the terms of either : *
10 * *
11 * - the GNU Lesser General Public License as published by *
12 * the Free Software Foundation, either version 3 of the License, *
13 * or (at your option) any later version, *
14 * - the MIT license (MIT), *
15 * - or both in dual license, as here. *
16 * *
17 * (see https://agrum.gitlab.io/articles/dual-licenses-lgplv3mit.html) *
18 * *
19 * This aGrUM/pyAgrum library is distributed in the hope that it will be *
20 * useful, but WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, *
21 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES MERCHANTABILITY or FITNESS *
22 * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE *
23 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER *
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, *
25 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR *
26 * OTHER DEALINGS IN THE SOFTWARE. *
27 * *
28 * See LICENCES for more details. *
29 * *
30 * SPDX-FileCopyrightText: Copyright 2005-2025 *
31 * - Pierre-Henri WUILLEMIN(_at_LIP6) *
32 * - Christophe GONZALES(_at_AMU) *
33 * SPDX-License-Identifier: LGPL-3.0-or-later OR MIT *
34 * *
35 * Contact : info_at_agrum_dot_org *
36 * homepage : http://agrum.gitlab.io *
37 * gitlab : https://gitlab.com/agrumery/agrum *
38 * *
39 ****************************************************************************/
40
41
50#ifndef GUM_IDATABASE_TABLE_H
51#define GUM_IDATABASE_TABLE_H
52
53#include <cstddef>
54#include <cstring>
55#include <memory>
56#include <mutex>
57#include <string>
58#include <utility>
59#include <vector>
60
61#include <agrum/agrum.h>
62
68
69namespace gum {
70
71 namespace learning {
72
// Primary template of the insertion-interface helper, selected by a
// compile-time boolean.
// NOTE(review): the declarator line that followed (source line 74) was lost
// when this listing was extracted. Judging from the base class used by
// IDatabaseTable below, the name is presumably IDatabaseTableInsert4DBCell
// -- confirm against the original header.
73 template < bool ENABLE_INSERT >
75
// Full specialization of the insertion-interface helper that declares the
// DBCell-based insertion methods in addition to the string-based ones.
// NOTE(review): the class-head line (source line 77) was lost when this
// listing was extracted; this is presumably the `true` specialization of
// IDatabaseTableInsert4DBCell -- confirm against the original header.
76 template <>
// the type for the vectors used by the insertion methods
78 template < typename TX_DATA >
79 using DBVector = std::vector< TX_DATA >;
80
// NOTE(review): the alias declared by this template (source line 82) is
// missing from the listing; it is presumably `Row`, which the insertRow
// overloads below rely on -- confirm.
81 template < typename TX_DATA >
83
// the type for matrices, i.e., vectors of DBRow
84 template < typename TX_DATA >
85 using Matrix = std::vector< DBRow< TX_DATA > >;
86
87
89
// insert a new DBRow at the end of the database (the row is passed as an
// rvalue reference)
91 virtual void insertRow(Row< DBCell >&& new_row) = 0;
92
94
// insert a new row at the end of the database (const-reference overload)
96 virtual void insertRow(const Row< DBCell >& new_row) = 0;
97
99
// insert a set of new DBRows at the end of the database (rvalue-reference
// overload)
101 virtual void insertRows(Matrix< DBCell >&& new_rows) = 0;
102
104
// insert a set of new DBRows at the end of the database (const-reference
// overload)
106 virtual void insertRows(const Matrix< DBCell >& new_rows) = 0;
107
109
// insert a new row, given as a vector of strings, at the end of the database
111 virtual void insertRow(const std::vector< std::string >& new_row) = 0;
112
114
// insert new rows, each given as a vector of strings, at the end of the
// database. Unlike the other methods of this interface, this one is not pure
// virtual, so an implementation is expected elsewhere (not visible here).
116 virtual void insertRows(const DBVector< DBVector< std::string > >& new_rows);
117 };
118
// Full specialization of the insertion-interface helper that only declares
// the string-based insertion methods (no DBCell overloads).
// NOTE(review): the class-head line (source line 120) was lost when this
// listing was extracted; this is presumably the `false` specialization of
// IDatabaseTableInsert4DBCell -- confirm against the original header.
119 template <>
// the type for the vectors used by the insertion methods
121 template < typename TX_DATA >
122 using DBVector = std::vector< TX_DATA >;
123
// NOTE(review): the alias declared by this template (source line 125) is
// missing from the listing; presumably `Row` -- confirm.
124 template < typename TX_DATA >
126
// the type for matrices, i.e., vectors of DBRow
127 template < typename TX_DATA >
128 using Matrix = std::vector< DBRow< TX_DATA > >;
129
131
// insert a new row, given as a vector of strings, at the end of the database
133 virtual void insertRow(const std::vector< std::string >& new_row) = 0;
134
136
// insert new rows, each given as a vector of strings, at the end of the
// database. Not pure virtual: an implementation is expected elsewhere (not
// visible here).
138 virtual void insertRows(const DBVector< DBVector< std::string > >& new_rows);
139 };
140
// Head of the IDatabaseTable class template (the common class for tabular
// databases, parameterized by the type T_DATA of the data stored in the rows).
// NOTE(review): the `class IDatabaseTable:` line (source line 268) was lost
// when this listing was extracted.
267 template < typename T_DATA >
// The insertion interface inherited depends on T_DATA: the boolean passed to
// IDatabaseTableInsert4DBCell is true exactly when T_DATA is not DBCell.
269 public IDatabaseTableInsert4DBCell< !std::is_same< T_DATA, DBCell >::value > {
270 public:
// the type for the vectors used in the IDatabaseTable
272 template < typename TX_DATA >
273 using DBVector = std::vector< TX_DATA >;
274
// a row of the database. NOTE(review): the alias itself (source line 277,
// listed in this page's member index as `DBRow< TX_DATA > Row`) is missing
// from the listing.
276 template < typename TX_DATA >
278
// the type for the matrices stored into the database
280 template < typename TX_DATA >
281 using Matrix = std::vector< DBRow< TX_DATA > >;
282
// the type used to pass the set of symbols denoting missing values
283 using MissingValType = std::vector< std::string >;
284
// char-sized flag passed to insertRow/insertRows to tell whether a row
// contains missing data
285 enum IsMissing : char { False, True };
286
// the (unsafe) handler for the tabular databases
378 class Handler: public DBHandler< T_DATA > {
379 public:
// Types for STL compliance.
// NOTE(review): the declarations of value_type, reference, const_reference
// and pointer appear in this page's member index but are missing from the
// listing (lost in extraction).
382 using iterator_category = std::random_access_iterator_tag;
387 using const_pointer = const value_type*;
388 using difference_type = std::ptrdiff_t;
390
// vector type used by the handler
391 template < typename TX_DATA >
392 using DBVector = std::vector< TX_DATA >;
393
// NOTE(review): the alias declared by this template (source line 395) is
// missing from the listing; presumably `Row` -- confirm.
394 template < typename TX_DATA >
396
// matrix type, i.e., vector of DBRow
397 template < typename TX_DATA >
398 using Matrix = std::vector< DBRow< TX_DATA > >;
399
400
401 // ########################################################################
403 // ########################################################################
405
// NOTE(review): the member index also lists a default constructor
// Handler(const IDatabaseTable< T_DATA >& db) and a move constructor
// Handler(Handler&& h); their declarations are missing from the listing.
407
410
412
// copy constructor
413 Handler(const Handler& h);
414
416
418
// destructor
420 virtual ~Handler();
421
423
424 // ########################################################################
426 // ########################################################################
428
// copy operator (the member index also lists a move operator, whose
// declaration is missing from the listing)
430 virtual Handler& operator=(const Handler&);
431
434
436
// makes the handler point to the next row in the database
438 virtual Handler& operator++() final;
439
441
// presumably the counterpart of operator++ (previous row); the original doc
// comment was stripped -- confirm
444 virtual Handler& operator--() final;
445
447
// presumably advances the handler by i rows -- confirm against the
// implementation
450 virtual Handler& operator+=(const std::size_t i) final;
451
453
// presumably moves the handler back by i rows -- confirm against the
// implementation
456 virtual Handler& operator-=(const std::size_t i) final;
457
// equality test with another handler (the compared criteria are not visible
// in this header)
459 virtual bool operator==(const Handler& handler) const final;
460
// inequality test with another handler
462 virtual bool operator!=(const Handler& handler) const final;
463
465
// dereference: gives read-only access to a row; presumably the row currently
// pointed to -- confirm
469 virtual const_reference operator*() const final;
470
472
// member access on a row; presumably the row currently pointed to -- confirm
476 virtual const_pointer operator->() const final;
477
479
480
481 // ########################################################################
483 // ########################################################################
485
487
// returns the number of rows managed by the handler
494 virtual std::size_t size() const final;
495
// returns the number of rows of the whole database
497 virtual std::size_t DBSize() const final;
498
500
// returns the current row pointed to by the handler (safe version)
501 virtual const_reference rowSafe() const final;
502
504
// non-const overload of rowSafe()
505 virtual reference rowSafe() final;
506
508
// returns the current row pointed to by the handler (unsafe version)
512 virtual const_reference row() const final;
513
515
// non-const overload of row()
519 virtual reference row() final;
520
// makes the handler point to the next row, equivalent to operator++
522 virtual void nextRow() final;
523
// the number of the current row (0 = the 1st row managed by the handler)
525 virtual std::size_t numRow() const final;
526
// indicates whether the handler has reached its end or not
528 virtual bool hasRows() const final;
529
// puts the handler to the beginning of the database's area it handles
531 virtual void reset() final;
532
// returns a new handler that points to the beginning of the database's area
// of the current handler
538 virtual Handler begin() const;
539
// returns a new handler that points to the end of the database's area of the
// current handler
545 virtual Handler end() const;
546
548
// sets the area in the database the handler will handle
558 virtual void setRange(std::size_t first, std::size_t last) final;
559
// returns the current range of the handler [begin,end)
561 virtual std::pair< std::size_t, std::size_t > range() const final;
562
// returns the names of the variables
564 virtual const DBVector< std::string >& variableNames() const final;
565
// returns the number of variables (columns) of the database
567 virtual std::size_t nbVariables() const final;
568
570
// returns a const reference to the database the handler refers to
572 virtual const IDatabaseTable< T_DATA >& database() const;
573
575
576
577#ifndef DOXYGEN_SHOULD_SKIP_THIS
578
579 protected:
// pointer to the database; presumably the one returned by database() --
// confirm
581 const IDatabaseTable< T_DATA >* _db_;
582
584
// pointer to a matrix of rows; presumably the content of the database --
// confirm
586 const Matrix< T_DATA >* _row_;
587
// presumably the index of the row currently pointed to -- confirm
589 std::size_t _index_{std::size_t(0)};
590
// presumably the first index of the area handled, see range() -- confirm
592 std::size_t _begin_index_{std::size_t(0)};
593
// presumably the (excluded) last index of the area handled, see range() --
// confirm
595 std::size_t _end_index_{std::size_t(0)};
596
// the enclosing database class is granted access to the handler's internals
597 friend class IDatabaseTable< T_DATA >;
598
599#endif /* DOXYGEN_SHOULD_SKIP_THIS */
600 };
601
// the safe handler of the tabular databases; it derives from the unsafe
// Handler and cannot be derived from (final)
693 class HandlerSafe final: public Handler {
694 public:
// Types for STL compliance.
// NOTE(review): value_type, reference, const_reference and pointer are listed
// in this page's member index but their declarations are missing from the
// listing (lost in extraction).
697 using iterator_category = std::random_access_iterator_tag;
702 using const_pointer = const value_type*;
703 using difference_type = std::ptrdiff_t;
705
706 // ########################################################################
708 // ########################################################################
710
// NOTE(review): the member index lists a default constructor
// HandlerSafe(const IDatabaseTable< T_DATA >& db), a copy constructor and a
// move constructor; their declarations are missing from the listing.
712
715
718
721
// destructor
723 virtual ~HandlerSafe();
724
726
727 // ########################################################################
729 // ########################################################################
731
734
// copy operator taking an (unsafe) Handler. The member index also lists
// operator= overloads taking const HandlerSafe&, Handler&& and HandlerSafe&&,
// whose declarations are missing from the listing.
736 virtual HandlerSafe& operator=(const Handler&);
737
740
743
745
746
747#ifndef DOXYGEN_SHOULD_SKIP_THIS
748
749 private:
// presumably registers this handler with its database (see
// IDatabaseTable::_attachHandler_) -- confirm against the implementation
751 void _attachHandler_();
752
// presumably unregisters this handler from its database (see
// IDatabaseTable::_detachHandler_) -- confirm against the implementation
754 void _detachHandler_();
755
// the enclosing database class is granted access to the handler's internals
756 friend class IDatabaseTable< T_DATA >;
757
758#endif /* DOXYGEN_SHOULD_SKIP_THIS */
759 };
760
// Types for STL compliance.
// NOTE(review): this page's member index also lists value_type
// (Row< T_DATA >), reference, const_reference, pointer, iterator (Handler),
// iterator_safe (HandlerSafe) and const_iterator_safe (const HandlerSafe);
// their declarations are missing from the listing (lost in extraction).
767 using const_pointer = const value_type*;
768 using size_type = std::size_t;
769 using difference_type = std::ptrdiff_t;
772 using const_iterator = const Handler;
775
776
777 // ##########################################################################
779 // ##########################################################################
781
// default constructor
// missing_symbols: the symbols denoting missing values
// var_names: the names of the variables (one per column)
783 IDatabaseTable(const MissingValType& missing_symbols,
784 const std::vector< std::string >& var_names);
785
// NOTE(review): the member index also lists a copy constructor and a move
// constructor; their declarations are missing from the listing.
788
791
// virtual copy constructor.
// NOTE(review): returns a raw pointer; presumably the caller owns the returned
// object -- confirm.
793 virtual IDatabaseTable< T_DATA >* clone() const = 0;
794
797
799
800
801 // ##########################################################################
803 // ##########################################################################
805
808
811
// returns a const reference to the unsafe handler pointing to the end of the
// database (presumably the object referenced by the private _end_ member --
// confirm)
813 const iterator& end() const noexcept;
814
// returns a const reference to the safe handler pointing to the end of the
// database (presumably the object referenced by the private _end_safe_
// member -- confirm)
816 const iterator_safe& endSafe() const noexcept;
817
819
820
821 // ##########################################################################
823 // ##########################################################################
825
// returns the content (the records) of the database
827 const Matrix< T_DATA >& content() const noexcept;
828
// NOTE(review): the member index lists handler() and handlerSafe() (new
// handlers pointing to the 1st record of the database); their declarations
// are missing from the listing.
831
834
836
// returns the variable names for all the columns of the database
837 const DBVector< std::string >& variableNames() const noexcept;
838
840
// sets the names of the variables
865 virtual void setVariableNames(const std::vector< std::string >& names,
866 const bool from_external_object = true)
867 = 0;
868
870
// returns the name of the kth column of the IDatabaseTable
872 const std::string& variableName(const std::size_t k) const;
873
875
// returns the index of the column whose name is passed in argument
880 std::size_t columnFromVariableName(const std::string& name) const;
881
883
// returns the indices of all the columns whose name is passed in argument
886 DBVector< std::size_t > columnsFromVariableName(const std::string& name) const;
887
// returns the number of variables (columns) of the database
889 std::size_t nbVariables() const noexcept;
890
// returns the number of records (rows) in the database
892 std::size_t nbRows() const noexcept;
893
// returns the number of records (rows) in the database
895 std::size_t size() const noexcept;
896
// indicates whether the database contains some records or not
898 bool empty() const noexcept;
899
901
// makes the database table ignore from now on the kth column
930 virtual void ignoreColumn(const std::size_t k, const bool from_external_object = true) = 0;
931
// returns the set of columns of the original dataset that are ignored
// NOTE(review): returned by const value, which prevents callers from moving
// from the result; changing it would require updating every override.
933 virtual const DBVector< std::size_t > ignoredColumns() const = 0;
934
// returns the set of columns of the original dataset that are present in the
// IDatabaseTable
937 virtual const DBVector< std::size_t > inputColumns() const = 0;
938
// make the insertRow overloads of the base class visible: without this
// using-declaration, the insertRow declarations below would hide them
939 using IDatabaseTableInsert4DBCell< !std::is_same< T_DATA, DBCell >::value >::insertRow;
940
942
// insert a new row, given as a vector of strings, at the end of the database
947 virtual void insertRow(const std::vector< std::string >& new_row) = 0;
948
950
// insert a new row of T_DATA (rvalue-reference overload);
// contains_missing_data tells whether the row contains missing data
956 virtual void insertRow(Row< T_DATA >&& new_row, const IsMissing contains_missing_data);
957
959
// insert a new row of T_DATA (const-reference overload);
// contains_missing_data tells whether the row contains missing data
965 virtual void insertRow(const Row< T_DATA >& new_row, const IsMissing contains_missing_data);
966
// make the insertRows overloads of the base class visible (same reason as for
// insertRow above)
967 using IDatabaseTableInsert4DBCell< !std::is_same< T_DATA, DBCell >::value >::insertRows;
968
970
// insert a set of new DBRows at the end of the database (rvalue-reference
// overload); rows_have_missing_vals holds one IsMissing flag per row --
// presumably in the same order as new_rows; confirm
982 virtual void insertRows(Matrix< T_DATA >&& new_rows,
983 const DBVector< IsMissing >& rows_have_missing_vals);
984
986
// insert a set of new DBRows at the end of the database (const-reference
// overload); see the overload above for rows_have_missing_vals
998 virtual void insertRows(const Matrix< T_DATA >& new_rows,
999 const DBVector< IsMissing >& rows_have_missing_vals);
1000
1002
// erase a given row specified by its index in the table
1005 void eraseRow(std::size_t index);
1006
// NOTE(review): the member index lists eraseFirstRow(), eraseLastRow() and
// eraseAllRows(); their declarations are missing from the listing.
1008
1011
1013
1016
1018
// erase the k first rows
1020 void eraseFirstRows(const std::size_t k);
1021
1023
// erase the k last rows
1025 void eraseLastRows(const std::size_t k);
1026
1028
// erase the rows from the deb-th to the end-th (the latter not included)
1029 void eraseRows(std::size_t deb, std::size_t end);
1030
1033
// erase the content of the database, including the names of the variables
1035 virtual void clear();
1036
// returns the set of missing symbols
1038 const DBVector< std::string >& missingSymbols() const;
1039
// indicates whether the database contains some missing values
1041 bool hasMissingValues() const;
1042
// presumably indicates whether the kth row contains some missing values --
// the original doc comment was stripped; confirm
1044 bool hasMissingValues(const std::size_t k) const;
1045
1047
// changes the max number of threads that a database can use
// (const because the corresponding data member is declared mutable)
1050 void setMaxNbThreads(const std::size_t nb) const;
1051
// returns the number of threads used to parse the database
1053 std::size_t nbThreads() const;
1054
// changes the minimum number of rows a thread should process in a
// multithreading context (const because the corresponding data member is
// declared mutable)
1064 void setMinNbRowsPerThread(const std::size_t nb) const;
1065
// returns the minimum number of rows that each thread should process
1067 std::size_t minNbRowsPerThread() const;
1068
// assigns a given weight to all the rows of the database
1070 void setAllRowsWeight(const double new_weight);
1071
1073
// assigns a given weight to the ith row of the database
1075 void setWeight(const std::size_t i, const double weight);
1076
1078
// returns the weight of the ith record
1080 double weight(const std::size_t i) const;
1081
// presumably returns the weight of the whole database -- the original doc
// comment was stripped; confirm
1083 double weight() const;
1084
1086
1087
1088 protected:
// NOTE(review): the member index lists variable_names_ (the names of the
// variables for each column); its declaration is missing from the listing.
1091
1092 // the vector of DBRows containing all the raw data
1093 Matrix< T_DATA > rows_;
1094
1095 // the set of strings corresponding to missing values
// NOTE(review): the declaration of missing_symbols_ (listed in the member
// index) is missing from the listing.
1097
1098 // a vector indicating which rows have missing values (char != 0)
// NOTE(review): the declaration of has_row_missing_val_ (listed in the member
// index) is missing from the listing.
1100
1101 // the maximal number of threads that the database can use
// (mutable so that the const setter setMaxNbThreads can modify it; defaults
// to gum::getNumberOfThreads())
1102 mutable std::size_t max_nb_threads_{std::size_t(gum::getNumberOfThreads())};
1103
1104 // the min number of rows that a thread should process in a
1105 // multithreading context
// (mutable so that the const setter setMinNbRowsPerThread can modify it;
// defaults to 100)
1106 mutable std::size_t min_nb_rows_per_thread_{100};
1107
1108
// checks whether a size corresponds to the number of columns of the database
1111 bool isRowSizeOK_(const std::size_t size) const;
1112
// returns the number of threads used to process the current database content
1114 std::size_t nbProcessingThreads_() const;
1115
// returns the ranges that threads should process, as pairs of row indices
1117 std::vector< std::pair< std::size_t, std::size_t > >
1118 rangesProcessingThreads_(const std::size_t nb_threads) const;
1119
1122
1125
1126
1127#ifndef DOXYGEN_SHOULD_SKIP_THIS
1128
1129 private:
1130 // the list of handlers currently attached to the database
1131 /* this is useful when the database is resized */
// (mutable: the attach/detach methods below are const)
1132 mutable DBVector< HandlerSafe* > _list_of_safe_handlers_;
1133
1134 // a mutex to safely add/remove handlers in _list_of_safe_handlers_
1135 mutable std::mutex _safe_handlers_mutex_;
1136
1137 // the end iterator for the database
// NOTE(review): raw pointer whose ownership is not visible in this header;
// presumably allocated by _createEndIterators_ and released by the
// destructor -- confirm.
1138 Handler* _end_{nullptr};
1139
1140 // the safe end iterator for the database
// NOTE(review): same ownership question as for _end_ above.
1141 iterator_safe* _end_safe_{nullptr};
1142
// presumably adds the handler to _list_of_safe_handlers_ -- confirm against
// the implementation
1144 void _attachHandler_(HandlerSafe* handler) const;
1145
// presumably removes the handler from _list_of_safe_handlers_ -- confirm
// against the implementation
1147 void _detachHandler_(HandlerSafe* handler) const;
1148
// presumably notifies the attached safe handlers that the database now has
// new_size rows -- confirm against the implementation
1150 void _updateHandlers_(std::size_t new_size) const;
1151
1152 // create the end iterators
1153 void _createEndIterators_();
1154
1155#endif /* DOXYGEN_SHOULD_SKIP_THIS */
1156
1157
// the nested handler classes are granted access to the database's internals
1159 friend class Handler;
1160 friend class HandlerSafe;
1161 };
1162
1163 } /* namespace learning */
1164
1165} /* namespace gum */
1166
1169
1170#endif /* GUM_IDATABASE_TABLE_H */
The class representing the original values of the cells of databases.
The base class for all database handlers.
The base class for all the tabular databases' cell translators.
The implementation of the common class for tabular databases.
The class representing the original values of the cells of databases.
Definition DBCell.h:93
The base class for all database handlers.
Definition DBHandler.h:140
DBRow< T_DATA > value_type
Types for STL compliance.
Definition DBHandler.h:145
the safe handler of the tabular databases
std::ptrdiff_t difference_type
Types for STL compliance.
virtual HandlerSafe & operator=(HandlerSafe &&)
move operator
value_type * pointer
Types for STL compliance.
HandlerSafe(const IDatabaseTable< T_DATA > &db)
default constructor
virtual HandlerSafe & operator=(const Handler &)
copy operator
virtual HandlerSafe & operator=(const HandlerSafe &)
copy operator
const value_type * const_pointer
Types for STL compliance.
virtual HandlerSafe & operator=(Handler &&)
move operator
std::random_access_iterator_tag iterator_category
Types for STL compliance.
value_type & reference
Types for STL compliance.
HandlerSafe(HandlerSafe &&h)
move constructor
const value_type & const_reference
Types for STL compliance.
HandlerSafe(const HandlerSafe &h)
copy constructor
typename Handler::value_type value_type
Types for STL compliance.
the (unsafe) handler for the tabular databases
virtual Handler & operator++() final
makes the handler point to the next row in the database
const value_type * const_pointer
Types for STL compliance.
virtual const DBVector< std::string > & variableNames() const final
returns the names of the variables
virtual Handler begin() const
returns a new handler that points to the beginning of the database's area of the current handler
Handler(const IDatabaseTable< T_DATA > &db)
default constructor
const value_type & const_reference
Types for STL compliance.
value_type & reference
Types for STL compliance.
virtual void setRange(std::size_t first, std::size_t last) final
sets the area in the database the handler will handle
virtual bool hasRows() const final
indicates whether the handler has reached its end or not
virtual Handler end() const
returns a new handler that points to the end of the database's area of the current handler
virtual Handler & operator=(Handler &&)
move operator
virtual std::size_t size() const final
returns the number of rows managed by the handler
virtual const_reference row() const final
returns the current row pointed to by the handler (unsafe version)
virtual void reset() final
puts the handler to the beginning of the database's area it handles
virtual const IDatabaseTable< T_DATA > & database() const
returns a reference to the database
virtual Handler & operator=(const Handler &)
copy operator
typename DBHandler< T_DATA >::value_type value_type
Types for STL compliance.
virtual std::size_t nbVariables() const final
returns the number of variables (columns) of the database
std::ptrdiff_t difference_type
Types for STL compliance.
virtual std::size_t numRow() const final
the number of the current row (0 = the 1st row managed by the handler)
std::random_access_iterator_tag iterator_category
Types for STL compliance.
virtual const_reference rowSafe() const final
returns the current row pointed to by the handler (safe version)
virtual std::pair< std::size_t, std::size_t > range() const final
returns the current range of the handler [begin,end)
Handler(Handler &&h)
move constructor
Handler(const Handler &h)
copy constructor
virtual void nextRow() final
makes the handler point to the next row, equivalent to operator++
std::vector< DBRow< TX_DATA > > Matrix
virtual std::size_t DBSize() const final
returns the number of rows of the whole database
value_type * pointer
Types for STL compliance.
const iterator & end() const noexcept
returns a reference to the unsafe handler pointing to the end of the database
void setAllRowsWeight(const double new_weight)
assign a given weight to all the rows of the database
IDatabaseTable(IDatabaseTable< T_DATA > &&from)
move constructor
IDatabaseTable< T_DATA > & operator=(const IDatabaseTable< T_DATA > &from)
copy operator
value_type & reference
Types for STL compliance.
std::size_t nbProcessingThreads_() const
returns the number of threads used to process the current database content
const Handler const_iterator
Types for STL compliance.
DBVector< IsMissing > has_row_missing_val_
IDatabaseTable< T_DATA > & operator=(IDatabaseTable< T_DATA > &&from)
move operator
const iterator_safe & endSafe() const noexcept
returns a reference to the safe handler pointing to the end of the database
std::size_t nbRows() const noexcept
returns the number of records (rows) in the database
IDatabaseTable(const MissingValType &missing_symbols, const std::vector< std::string > &var_names)
default constructor
DBVector< std::string > variable_names_
the names of the variables for each column
bool empty() const noexcept
indicates whether the database contains some records or not
void eraseRow(std::size_t index)
erase a given row specified by its index in the table
std::ptrdiff_t difference_type
Types for STL compliance.
virtual const DBVector< std::size_t > ignoredColumns() const =0
returns the set of columns of the original dataset that are ignored
void setMinNbRowsPerThread(const std::size_t nb) const
changes the minimum number of rows a thread should process in a multithreading context
void eraseFirstRows(const std::size_t k)
erase the k first rows
DBVector< std::string > missing_symbols_
virtual void clear()
erase the content of the database, including the names of the variables
void eraseLastRow()
erase the last row
const DBVector< std::string > & missingSymbols() const
returns the set of missing symbols
IDatabaseTable(const IDatabaseTable< T_DATA > &from)
copy constructor
std::vector< std::pair< std::size_t, std::size_t > > rangesProcessingThreads_(const std::size_t nb_threads) const
returns the ranges that threads should process
std::size_t minNbRowsPerThread() const
returns the minimum number of rows that each thread should process
std::vector< std::string > MissingValType
const Matrix< T_DATA > & content() const noexcept
returns the content (the records) of the database
const DBVector< std::string > & variableNames() const noexcept
returns the variable names for all the columns of the database
std::size_t columnFromVariableName(const std::string &name) const
returns the index of the column whose name is passed in argument
Row< T_DATA > value_type
Types for STL compliance.
const HandlerSafe const_iterator_safe
Types for STL compliance.
void eraseLastRows(const std::size_t k)
erase the k last rows
iterator begin() const
returns a new unsafe handler pointing to the beginning of the database
void eraseRows(std::size_t deb, std::size_t end)
erase the rows from the deb-th to the end-th (the latter not included)
virtual void setVariableNames(const std::vector< std::string > &names, const bool from_external_object=true)=0
sets the names of the variables
iterator handler() const
returns a new unsafe handler pointing to the 1st record of the database
DBVector< std::size_t > columnsFromVariableName(const std::string &name) const
returns the indices of all the columns whose name is passed in argument
virtual IDatabaseTable< T_DATA > * clone() const =0
virtual copy constructor
const value_type * const_pointer
Types for STL compliance.
virtual void ignoreColumn(const std::size_t k, const bool from_external_object=true)=0
makes the database table ignore from now on the kth column
void setMaxNbThreads(const std::size_t nb) const
changes the max number of threads that a database can use
double weight(const std::size_t i) const
returns the weight of the ith record
std::size_t size_type
Types for STL compliance.
std::size_t size() const noexcept
returns the number of records (rows) in the database
virtual void insertRow(const std::vector< std::string > &new_row)=0
insert a new row at the end of the database
bool hasMissingValues() const
indicates whether the database contains some missing values
std::vector< DBRow< TX_DATA > > Matrix
the type for the matrices stored into the database
value_type * pointer
Types for STL compliance.
void eraseAllRows()
erase all the rows
std::size_t nbThreads() const
returns the number of threads used to parse the database
iterator_safe handlerSafe() const
returns a new safe handler pointing to the 1st record of the database
const std::string & variableName(const std::size_t k) const
returns the name of the kth column of the IDatabaseTable
std::size_t nbVariables() const noexcept
returns the number of variables (columns) of the database
void eraseFirstRow()
erase the first row
void setWeight(const std::size_t i, const double weight)
assigns a given weight to the ith row of the database
std::vector< TX_DATA > DBVector
the type for the vectors used in the IDatabaseTable
Handler iterator
Types for STL compliance.
const value_type & const_reference
Types for STL compliance.
virtual ~IDatabaseTable()
destructor
DBRow< TX_DATA > Row
a row of the database
virtual void insertRows(Matrix< T_DATA > &&new_rows, const DBVector< IsMissing > &rows_have_missing_vals)
insert a set of new DBRows at the end of the database
bool isRowSizeOK_(const std::size_t size) const
checks whether a size corresponds to the number of columns of the database
iterator_safe beginSafe() const
returns a new safe handler pointing to the beginning of the database
virtual const DBVector< std::size_t > inputColumns() const =0
returns the set of columns of the original dataset that are present in the IDatabaseTable
HandlerSafe iterator_safe
Types for STL compliance.
include the inlined functions if necessary
Definition CSVParser.h:54
gum is the global namespace for all aGrUM entities
Definition agrum.h:46
unsigned int getNumberOfThreads()
returns the max number of threads used by default when entering the next parallel region
STL namespace.
virtual void insertRows(const DBVector< DBVector< std::string > > &new_rows)
insert new rows at the end of the database
virtual void insertRow(const std::vector< std::string > &new_row)=0
insert a new row at the end of the database
virtual void insertRows(const Matrix< DBCell > &new_rows)=0
insert a set of new DBRows at the end of the database
virtual void insertRow(const std::vector< std::string > &new_row)=0
insert a new row at the end of the database
virtual void insertRow(Row< DBCell > &&new_row)=0
insert a new DBRow at the end of the database
virtual void insertRows(Matrix< DBCell > &&new_rows)=0
insert a set of new DBRows at the end of the database
virtual void insertRows(const DBVector< DBVector< std::string > > &new_rows)
insert new rows at the end of the database
virtual void insertRow(const Row< DBCell > &new_row)=0
insert a new row at the end of the database
The class to use to execute a function by several threads.
Utility functions used for exploiting OpenMP/STL parallelism.