aGrUM 2.3.2
a C++ library for (probabilistic) graphical models
rawDatabaseTable.cpp
Go to the documentation of this file.
1/****************************************************************************
2 * This file is part of the aGrUM/pyAgrum library. *
3 * *
4 * Copyright (c) 2005-2025 by *
5 * - Pierre-Henri WUILLEMIN(_at_LIP6) *
6 * - Christophe GONZALES(_at_AMU) *
7 * *
8 * The aGrUM/pyAgrum library is free software; you can redistribute it *
9 * and/or modify it under the terms of either : *
10 * *
11 * - the GNU Lesser General Public License as published by *
12 * the Free Software Foundation, either version 3 of the License, *
13 * or (at your option) any later version, *
14 * - the MIT license (MIT), *
15 * - or both in dual license, as here. *
16 * *
17 * (see https://agrum.gitlab.io/articles/dual-licenses-lgplv3mit.html) *
18 * *
19 * This aGrUM/pyAgrum library is distributed in the hope that it will be *
20 * useful, but WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, *
21 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES MERCHANTABILITY or FITNESS *
22 * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE *
23 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER *
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, *
25 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR *
26 * OTHER DEALINGS IN THE SOFTWARE. *
27 * *
28 * See LICENCES for more details. *
29 * *
30 * SPDX-FileCopyrightText: Copyright 2005-2025 *
31 * - Pierre-Henri WUILLEMIN(_at_LIP6) *
32 * - Christophe GONZALES(_at_AMU) *
33 * SPDX-License-Identifier: LGPL-3.0-or-later OR MIT *
34 * *
35 * Contact : info_at_agrum_dot_org *
36 * homepage : http://agrum.gitlab.io *
37 * gitlab : https://gitlab.com/agrumery/agrum *
38 * *
39 ****************************************************************************/
40
41
47
49
50#ifndef DOXYGEN_SHOULD_SKIP_THIS
51
53# ifdef GUM_NO_INLINE
55# endif /* GUM_NO_INLINE */
56
57namespace gum {
58
59 namespace learning {
60
61 // virtual copy constructor
62 RawDatabaseTable* RawDatabaseTable::clone() const { return new RawDatabaseTable(*this); }
63
64 // copy operator
66 if (this != &from) {
68 _ignored_cols_ = from._ignored_cols_;
69 }
70 return *this;
71 }
72
73 // move operator
75 if (this != &from) {
77 _ignored_cols_ = std::move(from._ignored_cols_);
78 }
79 return *this;
80 }
81
82 // sets the names of the variables
83 void RawDatabaseTable::setVariableNames(const std::vector< std::string >& names,
84 const bool from_external_object) {
85 const std::size_t size = names.size();
86 const std::size_t ignored_cols_size = _ignored_cols_.size();
87
88 if (!from_external_object || !ignored_cols_size) {
89 if (this->rows_.empty() || (size == this->rows_[0].size())) {
90 this->variable_names_ = names;
91 } else {
92 GUM_ERROR(SizeError,
93 "the number of variable's names (i.e., "
94 << size << ") does not correspond to the number of columns of the "
95 << "raw database table (i.e.," << this->rows_[0].size() << ")");
96 }
97 } else {
98 // check that the size of the names vector (after removing the ignored
99 // columns) is the same as the rest of the database
100 std::size_t ignored_size = std::size_t(0);
101
102 // find the number of ignored cols
103 for (auto iter = _ignored_cols_.rbegin(), rend = _ignored_cols_.rend(); iter != rend;
104 ++iter, ++ignored_size) {
105 if (*iter < size) { break; }
106 }
107 ignored_size = ignored_cols_size - ignored_size;
108
109 if (this->rows_.empty() || (size == this->rows_[0].size() + ignored_size)) {
110 DBVector< std::string > new_names;
111 for (std::size_t i = std::size_t(0), j = std::size_t(0); i < size; ++i) {
112 if (i != _ignored_cols_[j]) {
113 new_names.push_back(names[i]);
114 } else {
115 if (++j == ignored_cols_size) {
116 for (++i; i < size; ++i) {
117 new_names.push_back(names[i]);
118 }
119 }
120 }
121 }
122 this->variable_names_ = std::move(new_names);
123 return;
124 } else {
125 GUM_ERROR(SizeError,
126 "the number of variable's names excluding the ignored "
127 << "columns (i.e., " << (size - ignored_size)
128 << ") does not correspond to the number of columns of the "
129 << "raw database table (i.e.," << this->rows_[0].size() << ")");
130 }
131 }
132 }
133
135 void RawDatabaseTable::ignoreColumn(const std::size_t k, const bool from_external_object) {
136 // first, compute the value that k would have in an external database
137 // and compute where the new value should be inserted
138 std::size_t i; // where to insert the new k into the ignored colums
139 std::size_t kk = k; // kk = k value for an external database
140 const std::size_t size = _ignored_cols_.size();
141
142 if (from_external_object) {
143 for (i = std::size_t(0); i < size; ++i) {
144 if (k <= _ignored_cols_[i]) {
145 if (k == _ignored_cols_[i]) return;
146 break;
147 }
148 }
149 } else {
150 for (i = std::size_t(0); i < size; ++i, ++kk) {
151 if (kk <= _ignored_cols_[i]) {
152 if (kk == _ignored_cols_[i]) return;
153 break;
154 }
155 }
156 }
157
158 // the column of _rows_ and variable_names_ impacted by the ignoreColumn
159 // operation is therefore equal to kk-i. So, we should check that such
160 // a column exists and, if so, we should remove the column from _rows_
161 // and from variable_names_. Note that if there is no more variable,
162 // _rows_ should become empty
163 const std::size_t col = kk - i;
164 if (col < this->variable_names_.size()) {
165 this->variable_names_.erase(this->variable_names_.begin() + col);
166 if (this->variable_names_.empty()) {
168 } else {
169 const std::size_t nb_rows = this->rows_.size();
170 if (nb_rows != std::size_t(0)) {
171 const std::size_t nb_cols = this->rows_[0].size();
172 for (std::size_t i = std::size_t(0); i < nb_rows; ++i) {
173 auto& row = this->rows_[i].row();
174 if (this->has_row_missing_val_[i] == IsMissing::True) {
175 bool has_missing_val = false;
176 for (std::size_t j = std::size_t(0); j < nb_cols; ++j) {
177 if ((j != col) && row[j].isMissing()) {
178 has_missing_val = true;
179 break;
180 }
181 }
182 if (!has_missing_val) this->has_row_missing_val_[i] = IsMissing::False;
183 }
184 row.erase(row.begin() + col);
185 }
186 }
187 }
188 }
189
190 // here, we know that we should insert kk at the ith index of _ignored_cols_
191 _ignored_cols_.push_back(std::size_t(0));
192 for (std::size_t j = size; j > i; --j)
193 _ignored_cols_[j] = _ignored_cols_[j - 1];
194 _ignored_cols_[i] = kk;
195 }
196
// Returns the indices, in the original dataset, of the columns present in the
// table: the consecutive integers 0, 1, 2, ... from which the indices stored
// in _ignored_cols_ are skipped, with one entry per element of the first
// stored row. An empty vector is returned when the table contains no row.
// NOTE(review): the line carrying the function name and the line declaring
// `cols` are absent from this listing; presumably `cols` is a
// DBVector< std::size_t > holding `size` elements -- confirm against the file.
198 const typename RawDatabaseTable::template DBVector< std::size_t >
200 const auto& data = IDatabaseTable< DBCell >::content();
// without any row, the number of columns cannot be read from the content
201 if (data.empty()) { return DBVector< std::size_t >(); }
202
// the number of columns kept in the table is read from the first row
203 const std::size_t size = data[0].size();
204 const std::size_t ignored_cols_size = _ignored_cols_.size();
206
207 if (!ignored_cols_size) {
// nothing is ignored: the kept columns are simply 0, 1, ..., size - 1
208 for (std::size_t i = std::size_t(0); i < size; ++i) {
209 cols[i] = i;
210 }
211 } else {
212 // fill the cols vector with consecutive values, excluding the
213 // ignored columns
214 std::size_t i = std::size_t(0); // the consecutive values
215 std::size_t k = std::size_t(0); // the index in col where we save values
216 std::size_t j = std::size_t(0); // the index to parse the ignored columns
217 while (true) {
218 if (i != _ignored_cols_[j]) {
// i is not the next ignored index: keep it and stop once cols is full
219 cols[k] = i;
220 if (++k == size) break;
221 } else {
// i is ignored: move on to the next ignored index. When none is left,
// all the remaining entries of cols are consecutive values
222 if (++j == ignored_cols_size) {
223 for (++i; k < size; ++i, ++k) {
224 cols[k] = i;
225 }
226 break;
227 }
228 }
229 ++i;
230 }
231 }
232
233 return cols;
234 }
235
236 // insert a new row at the end of the database
// new_row: the values of the new record, given as strings. The entries whose
//   index is stored in _ignored_cols_ are skipped; every other entry is
//   converted with _convert_ into a DBCell.
// A SizeError is reported through GUM_ERROR when isRowSizeOK_ rejects the
// number of entries remaining once the ignored columns are discarded.
237 void RawDatabaseTable::insertRow(const std::vector< std::string >& new_row) {
238 // check that the size of the row (after removing the ignored columns) is
239 // the same as the rest of the database
240 const std::size_t row_size = new_row.size();
241 const std::size_t ignored_cols_size = _ignored_cols_.size();
242 std::size_t ignored_size = std::size_t(0);
243 if (ignored_cols_size) {
244 // find the number of ignored cols
// the scan starts from the end of _ignored_cols_ and first counts the
// ignored indices that are not smaller than row_size, i.e., those that
// cannot correspond to an entry of new_row
245 for (auto iter = _ignored_cols_.rbegin(), rend = _ignored_cols_.rend(); iter != rend;
246 ++iter, ++ignored_size) {
247 if (*iter < row_size) { break; }
248 }
// what remains is the number of ignored indices to discard from new_row
249 ignored_size = ignored_cols_size - ignored_size;
250 }
251
252 if (!this->isRowSizeOK_(row_size - ignored_size)) {
253 GUM_ERROR(SizeError,
254 "the new row has " << (row_size - ignored_size)
255 << " elements whereas the raw database table has "
256 << this->variable_names_.size() << " columns");
257 }
258
259 // create the dbrow that will contain the new data
260 Row< DBCell > dbrow;
261 dbrow.reserve(row_size - ignored_size);
// set to true as soon as one converted cell reports isMissing()
262 bool has_missing_val = false;
263
264 // translate the row into T_data and put them into the newly created dbrow
265 if (ignored_size == 0) {
// no entry of new_row is ignored: convert all of them
266 for (const auto& elt: new_row) {
267 const DBCell new_cell(this->_convert_(elt));
268 if (new_cell.isMissing()) has_missing_val = true;
269 dbrow.pushBack(new_cell);
270 }
271 } else {
// i parses new_row, j parses the ignored indices to discard
272 for (std::size_t i = std::size_t(0), j = std::size_t(0); i < row_size; ++i) {
273 if (i != _ignored_cols_[j]) {
274 const DBCell new_cell(this->_convert_(new_row[i]));
275 if (new_cell.isMissing()) has_missing_val = true;
276 dbrow.pushBack(new_cell);
277 } else {
// entry i is ignored. Once the last ignored index has been met, all
// the remaining entries are converted without further comparison
278 if (++j == ignored_size) {
279 for (++i; i < row_size; ++i) {
280 const DBCell new_cell(this->_convert_(new_row[i]));
281 if (new_cell.isMissing()) has_missing_val = true;
282 dbrow.pushBack(new_cell);
283 }
284 }
285 }
286 }
287 }
288
// NOTE(review): the beginning of the statement completed on the next line is
// absent from this listing; presumably it stores dbrow in the table together
// with its missing-value status -- confirm against the file.
290 has_missing_val ? IsMissing::True : IsMissing::False);
291 }
292
293 // erase the content of the database, including the names of the variables
295 _ignored_cols_.clear();
297 }
298
299
300 } /* namespace learning */
301
302} /* namespace gum */
303
304#endif /* DOXYGEN_SHOULD_SKIP_THIS */
IDatabaseTable< T_DATA > & operator=(const IDatabaseTable< T_DATA > &from)
copy operator
virtual void clear()
erase the content of the database, including the names of the variables
const Matrix< T_DATA > & content() const noexcept
returns the content (the records) of the database
std::size_t size() const noexcept
virtual void insertRow(const std::vector< std::string > &new_row)=0
insert a new row at the end of the database
void eraseAllRows()
erase all the rows
bool isRowSizeOK_(const std::size_t size) const
The table containing the raw/original data of a database.
virtual void clear() final
erase the content of the database, including the names of the variables
DBRow< TX_DATA > Row
a row of the database
virtual RawDatabaseTable * clone() const final
virtual copy constructor
void insertRow(const std::vector< std::string > &new_row) final
insert a new row at the end of the database
RawDatabaseTable()
default constructor
void ignoreColumn(const std::size_t k, const bool from_external_object=true) final
makes the database table ignore from now on the kth column
void setVariableNames(const std::vector< std::string > &names, const bool from_external_object=true) final
sets the names of the variables
std::vector< TX_DATA > DBVector
the type for the vectors used in the RawDatabaseTable
const DBVector< std::size_t > inputColumns() const final
returns the set of columns of the original dataset that are present in the RawDatabaseTable
RawDatabaseTable & operator=(const RawDatabaseTable &from)
copy operator
#define GUM_ERROR(type, msg)
Definition exceptions.h:72
include the inlined functions if necessary
Definition CSVParser.h:54
gum is the global namespace for all aGrUM entities
Definition agrum.h:46
The table containing the raw/original data of a database.
The implementation of raw tabular databases stored in memory (RAM).