aGrUM 2.3.2
a C++ library for (probabilistic) graphical models
indepTestG2.cpp
Go to the documentation of this file.
1/****************************************************************************
2 * This file is part of the aGrUM/pyAgrum library. *
3 * *
4 * Copyright (c) 2005-2025 by *
5 * - Pierre-Henri WUILLEMIN(_at_LIP6) *
6 * - Christophe GONZALES(_at_AMU) *
7 * *
8 * The aGrUM/pyAgrum library is free software; you can redistribute it *
9 * and/or modify it under the terms of either : *
10 * *
11 * - the GNU Lesser General Public License as published by *
12 * the Free Software Foundation, either version 3 of the License, *
13 * or (at your option) any later version, *
14 * - the MIT license (MIT), *
15 * - or both in dual license, as here. *
16 * *
17 * (see https://agrum.gitlab.io/articles/dual-licenses-lgplv3mit.html) *
18 * *
19 * This aGrUM/pyAgrum library is distributed in the hope that it will be *
20 * useful, but WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, *
21 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES MERCHANTABILITY or FITNESS *
22 * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE *
23 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER *
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, *
25 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR *
26 * OTHER DEALINGS IN THE SOFTWARE. *
27 * *
28 * See LICENCES for more details. *
29 * *
30 * SPDX-FileCopyrightText: Copyright 2005-2025 *
31 * - Pierre-Henri WUILLEMIN(_at_LIP6) *
32 * - Christophe GONZALES(_at_AMU) *
33 * SPDX-License-Identifier: LGPL-3.0-or-later OR MIT *
34 * *
35 * Contact : info_at_agrum_dot_org *
36 * homepage : http://agrum.gitlab.io *
37 * gitlab : https://gitlab.com/agrumery/agrum *
38 * *
39 ****************************************************************************/
40
41
48
50
51#ifndef DOXYGEN_SHOULD_SKIP_THIS
52
54# ifdef GUM_NO_INLINE
56# endif /* GUM_NO_INLINE */
57
58namespace gum {
59
60 namespace learning {
61
64 if (this != &from) {
66 // __chi2 = from. _chi2_;
67 }
68 return *this;
69 }
70
73 if (this != &from) {
74 IndependenceTest::operator=(std::move(from));
75 // __chi2 = std::move(from. _chi2_);
76 }
77 return *this;
78 }
79
81 std::pair< double, double > IndepTestG2::statistics_(const IdCondSet& idset) {
82 // get the counts
83 std::vector< double > N_xyz(this->counter_.counts(idset, true));
84 const bool informative_external_prior = this->prior_->isInformative();
85 if (informative_external_prior) this->prior_->addJointPseudoCount(idset, N_xyz);
86 const std::size_t all_size = (N_xyz.size());
87
88 // compute the domain sizes of X and Y
89 const auto& nodeId2cols = this->counter_.nodeId2Columns();
90 const auto& database = this->counter_.database();
91 Idx var_x, var_y;
92 if (nodeId2cols.empty()) {
93 var_x = idset[0];
94 var_y = idset[1];
95 } else {
96 var_x = nodeId2cols.second(idset[0]);
97 var_y = nodeId2cols.second(idset[1]);
98 }
99
100 const std::size_t X_size = database.domainSize(var_x);
101 const std::size_t Y_size = database.domainSize(var_y);
102
103 double cumulStat = 0.0;
104
105 // here, we distinguish idsets with conditioning nodes from those
106 // without conditioning nodes
107 if (idset.hasConditioningSet()) {
108 const std::size_t Z_size = all_size / (X_size * Y_size);
109
110 // get the counts for the conditioning nodes
111 std::vector< double > N_xz
112 = this->marginalize_(std::size_t(1), X_size, Y_size, Z_size, N_xyz);
113 std::vector< double > N_yz
114 = this->marginalize_(std::size_t(0), X_size, Y_size, Z_size, N_xyz);
115 std::vector< double > N_z
116 = this->marginalize_(std::size_t(2), X_size, Y_size, Z_size, N_xyz);
117
118 // indicate to the chi2 distribution the set of conditioning nodes
119 std::vector< Idx > cond_nodes;
120 cond_nodes.reserve(idset.nbRHSIds());
121 {
122 const auto cond_idset = idset.conditionalIdCondSet().ids();
123 if (nodeId2cols.empty()) {
124 for (const auto node: cond_idset)
125 cond_nodes.push_back(node);
126 } else {
127 for (const auto node: cond_idset)
128 cond_nodes.push_back(nodeId2cols.second(node));
129 }
130 }
131 _chi2_.setConditioningNodes(cond_nodes);
132
133
134 // now, perform :
135 // sum_X sum_Y sum_Z #XYZ * log ( ( #XYZ * #Z ) / ( #XZ * #YZ ) )
136 for (std::size_t z = std::size_t(0),
137 beg_xz = std::size_t(0),
138 beg_yz = std::size_t(0),
139 xyz = std::size_t(0);
140 z < Z_size;
141 ++z, beg_xz += X_size, beg_yz += Y_size) {
142 if (N_z[z] > 0) {
143 for (std::size_t y = std::size_t(0), yz = beg_yz; y < Y_size; ++yz, ++y) {
144 for (std::size_t x = std::size_t(0), xz = beg_xz; x < X_size; ++xz, ++x, ++xyz) {
145 const double tmp1 = N_xyz[xyz] * N_z[z];
146 const double tmp2 = N_yz[yz] * N_xz[xz];
147 if ((tmp1 != 0.0) && (tmp2 != 0.0)) {
148 cumulStat += N_xyz[xyz] * std::log(tmp1 / tmp2);
149 }
150 }
151 }
152 } else { // moving xyz out of the loops x,y when if N_z[z]==0
153 xyz += X_size * Y_size;
154 }
155 }
156 } else {
157 // here, there is no conditioning set
158
159 // indicate to the chi2 distribution the set of conditioning nodes
160 _chi2_.setConditioningNodes(_empty_set_);
161
162 // now, perform sum_X sum_Y #XY * log ( ( #XY * N ) / ( #X * #Y ) )
163
164 // get the counts for all the targets and for the conditioning nodes
165 std::vector< double > N_x
166 = this->marginalize_(std::size_t(1), X_size, Y_size, std::size_t(1), N_xyz);
167 std::vector< double > N_y
168 = this->marginalize_(std::size_t(0), X_size, Y_size, std::size_t(1), N_xyz);
169
170 // count N
171 double N = 0.0;
172 for (auto n_x: N_x)
173 N += n_x;
174
175 for (std::size_t y = std::size_t(0), xy = 0; y < Y_size; ++y) {
176 const double tmp_Ny = N_y[y];
177 for (std::size_t x = 0; x < X_size; ++x, ++xy) {
178 const double tmp = (tmp_Ny * N_x[x]);
179 if ((tmp != 0.0) && (N_xyz[xy] != 0.0)) {
180 cumulStat += N_xyz[xy] * std::log((N_xyz[xy] * N) / tmp);
181 }
182 }
183 }
184 }
185
186 // used to make the G test formula asymptotically equivalent
187 // to the Pearson's chi-squared test formula
188 cumulStat *= 2;
189
190 Size df = _chi2_.degreesOfFreedom(var_x, var_y);
191 double pValue = _chi2_.probaChi2(cumulStat, df);
192 return std::pair< double, double >(cumulStat, pValue);
193 }
194
196 double IndepTestG2::score_(const IdCondSet& idset) {
197 // compute the domain sizes of X and Y
198 const auto& nodeId2cols = this->counter_.nodeId2Columns();
199 Idx var_x, var_y;
200 if (nodeId2cols.empty()) {
201 var_x = idset[0];
202 var_y = idset[1];
203 } else {
204 var_x = nodeId2cols.second(idset[0]);
205 var_y = nodeId2cols.second(idset[1]);
206 }
207
208 auto stat = statistics_(idset);
209 double score = stat.first;
210
211 // ok, here, score contains the value of the chi2 formula.
212 // To get a meaningful score, we shall compute the critical values
213 // for the Chi2 distribution and assign as the score of
214 // (score - alpha ) / alpha, where alpha is the critical value
215 const double alpha = _chi2_.criticalValue(var_x, var_y);
216 score = (score - alpha) / alpha;
217
218 return score;
219 }
220
221 } /* namespace learning */
222
223} /* namespace gum */
224
225#endif /* DOXYGEN_SHOULD_SKIP_THIS */
A class for storing a pair of sets of NodeIds, the second one corresponding to a conditional set.
Definition idCondSet.h:214
the class for computing G2 independence test scores
Definition indepTestG2.h:67
IndepTestG2 & operator=(const IndepTestG2 &from)
copy operator
virtual double score_(const IdCondSet &idset) final
returns the score for a given IdCondSet
std::pair< double, double > statistics_(const IdCondSet &idset)
compute the pair <G2 statistic,pvalue>
std::vector< double > marginalize_(const std::size_t node_2_marginalize, const std::size_t X_size, const std::size_t Y_size, const std::size_t Z_size, const std::vector< double > &N_xyz) const
returns a counting vector where variables are marginalized from N_xyz
double score(const NodeId var1, const NodeId var2)
returns the score of a pair of nodes
RecordCounter counter_
the record counter used for the counts over discrete variables
IndependenceTest & operator=(const IndependenceTest &from)
copy operator
Prior * prior_
the expert knowledge a priorwe add to the contingency tables
const DatabaseTable & database() const
return the database used by the score
std::size_t Size
In aGrUM, hashed values are unsigned long int.
Definition types.h:74
Size Idx
Type for indexes.
Definition types.h:79
the class for computing G2 scores
the class for computing G2 scores
include the inlined functions if necessary
Definition CSVParser.h:54
gum is the global namespace for all aGrUM entities
Definition agrum.h:46