52#ifndef GUM_LEARNING_GENERIC_BN_LEARNER_H
53#define GUM_LEARNING_GENERIC_BN_LEARNER_H
141 explicit Database(
const std::string& file,
142 const std::vector< std::string >& missing_symbols,
143 const bool induceTypes =
false);
162 Database(
const std::string& filename,
164 const std::vector< std::string >& missing_symbols);
173 template <
typename GUM_SCALAR >
176 const std::vector< std::string >& missing_symbols);
211 const std::vector< std::size_t >&
domainSizes()
const;
214 const std::vector< std::string >&
names()
const;
236 std::size_t
nbRows()
const;
239 std::size_t
size()
const;
250 double weight(
const std::size_t i)
const;
279 template <
typename GUM_SCALAR >
280 BayesNet< GUM_SCALAR >
_BNVars_()
const;
301 const std::vector< std::string >& missingSymbols,
302 bool induceTypes =
true);
323 template <
typename GUM_SCALAR >
326 const std::vector< std::string >& missing_symbols);
371 const std::vector< std::string >&
names()
const;
374 const std::vector< std::size_t >&
domainSizes()
const;
416 void useDatabaseRanges(
const std::vector< std::pair< std::size_t, std::size_t > >& new_ranges);
425 const std::vector< std::pair< std::size_t, std::size_t > >&
databaseRanges()
const;
449 const std::size_t k_fold);
459 std::pair< double, double >
468 std::pair< double, double >
chi2(
const std::string& name1,
469 const std::string& name2,
470 const std::vector< std::string >& knowing = {});
479 std::pair< double, double >
480 G2(NodeId id1, NodeId id2,
const std::vector< NodeId >& knowing = {});
488 std::pair< double, double >
G2(
const std::string& name1,
489 const std::string& name2,
490 const std::vector< std::string >& knowing = {});
500 const std::vector< NodeId >& knowing = {});
510 const std::vector< std::string >& knowing = {});
537 const std::string& var2,
538 const std::vector< std::string >& knowing = {});
555 const std::vector< NodeId >& knowing = {});
572 const std::string& var2,
573 const std::vector< std::string >& knowing = {});
583 double score(
NodeId vars,
const std::vector< NodeId >& knowing = {});
594 double score(
const std::string& vars,
const std::vector< std::string >& knowing = {});
601 std::vector< double >
rawPseudoCount(
const std::vector< NodeId >& vars);
608 std::vector< double >
rawPseudoCount(
const std::vector< std::string >& vars);
686 EMApproximationScheme&
EM();
768 void useK2(
const Sequence< NodeId >& order);
771 void useK2(
const std::vector< NodeId >& order);
830 void setSliceOrder(
const std::vector< std::vector< std::string > >& slices);
839 void addForbiddenArc(
const std::string& tail,
const std::string& head);
856 void addMandatoryArc(
const std::string& tail,
const std::string& head);
904 void addPossibleEdge(
const std::string& tail,
const std::string& head);
1020 std::vector< std::pair< std::size_t, std::size_t > >
ranges_;
1042 const std::vector< std::string >& missing_symbols);
1055 bool take_into_account_score =
true);
1097 const std::string& message) {
1110 algoK2_.approximationScheme().setEpsilon(eps);
1123 algoK2_.approximationScheme().disableEpsilon();
1130 algoK2_.approximationScheme().enableEpsilon();
1150 algoK2_.approximationScheme().setMinEpsilonRate(rate);
1163 algoK2_.approximationScheme().disableMinEpsilonRate();
1170 algoK2_.approximationScheme().enableMinEpsilonRate();
1190 algoK2_.approximationScheme().setMaxIter(max);
1203 algoK2_.approximationScheme().disableMaxIter();
1210 algoK2_.approximationScheme().enableMaxIter();
1231 algoK2_.approximationScheme().setMaxTime(timeout);
1250 algoK2_.approximationScheme().disableMaxTime();
1256 algoK2_.approximationScheme().enableMaxTime();
1274 algoK2_.approximationScheme().setPeriodSize(p);
1289 algoK2_.approximationScheme().setVerbosity(v);
1316 const std::vector< double >&
history()
const override {
1448 return dag2BN_.stateApproximationScheme();
1467#ifndef GUM_NO_INLINE
A class that, given a structure and a parameter estimator returns a full Bayes net.
The class for initializing DatabaseTable and RawDatabaseTable instances from CSV files.
A DBRowGenerator class that returns the rows that are complete (fully observed) w....
A DBRowGenerator class that returns incomplete rows as EM would do.
A Dirichlet prior: computes its N'_ijk from a database.
A pack of learning algorithms that can easily be used.
The SimpleMiic algorithm.
The base class for all directed edges.
Class representing a Bayesian network.
The base class for all undirected edges.
Exception : fatal (unknown ?) error.
Signaler1< const std::string & > onStop
Criteria messageApproximationScheme.
IApproximationSchemeConfiguration()
Class constructors.
ApproximationSchemeSTATE
The different state of an approximation scheme.
Signaler3< Size, double, double > onProgress
Progression, error and time.
Base class for mixed graphs.
Exception : operation not allowed.
Base class for partially directed acyclic graphs.
ThreadNumberManager(Size nb_threads=0)
default constructor
Base class for undirected graphs.
A class that redirects gum_signal from algorithms to the listeners of BNLearn.
A class that, given a structure and a parameter estimator returns a full Bayes net.
the class used to read a row in the database and to transform it into a set of DBRow instances that c...
The class representing a tabular database as used by learning tasks.
The greedy hill climbing learning algorithm (for directed graphs).
a helper to easily read databases
const std::vector< std::string > & missingSymbols() const
returns the set of missing symbols taken into account
Database(const std::string &file, const std::vector< std::string > &missing_symbols, const bool induceTypes=false)
default constructor
const DatabaseTable & databaseTable() const
returns the internal database table
Size _min_nb_rows_per_thread_
the minimal number of rows to parse (on average) by thread
std::size_t size() const
returns the number of records in the database
NodeId idFromName(const std::string &var_name) const
returns the node id corresponding to a variable name
std::vector< std::size_t > _domain_sizes_
the domain sizes of the variables (useful to speed-up computations)
DatabaseTable _database_
the database itself
const std::string & nameFromId(NodeId id) const
returns the variable name corresponding to a given node id
double weight(const std::size_t i) const
returns the weight of the ith record
Bijection< NodeId, std::size_t > _nodeId2cols_
a bijection assigning to each variable name its NodeId
const std::vector< std::string > & names() const
returns the names of the variables in the database
void setWeight(const std::size_t i, const double weight)
sets the weight of the ith record
const Bijection< NodeId, std::size_t > & nodeId2Columns() const
returns the mapping between node ids and their columns in the database
Database & operator=(const Database &from)
copy operator
DBRowGeneratorParser & parser()
returns the parser for the database
DBRowGeneratorParser * _parser_
the parser used for reading the database
void setDatabaseWeight(const double new_weight)
assign a weight to all the rows of the database so that the sum of their weights is equal to new_weight
BayesNet< GUM_SCALAR > _BNVars_() const
Database(const std::string &filename, const gum::BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)
constructor with a BN providing the variables of interest
Size _max_threads_number_
the max number of threads authorized
std::size_t nbRows() const
returns the number of records in the database
const std::vector< std::size_t > & domainSizes() const
returns the domain sizes of the variables
double weight() const
returns the weight of the whole database
A pack of learning algorithms that can easily be used.
StructuralConstraintPossibleEdges constraintPossibleEdges_
the constraint on possible Edges
Size EMnbrIterations() const
returns the number of iterations performed by the last EM execution
StructuralConstraintNoParentNodes constraintNoParentNodes_
the constraint on no parent nodes
void eraseNoChildrenNode(NodeId node)
double recordWeight(const std::size_t i) const
returns the weight of the ith record
BNLearnerPriorType priorType_
the prior selected for the score and parameters
double EMMaxTime() const
@brief returns EM's timeout (in milliseconds)
void disableEpsilon() override
Disable stopping criterion on epsilon.
double EMMinEpsilonRate() const
Get the value of the minimal log-likelihood evolution rate of EM.
double EMEpsilon() const
Get the value of EM's min diff epsilon.
const std::vector< std::size_t > & domainSizes() const
returns the domain sizes of the variables in the database
void EMenableEpsilon()
Enable the log-likelihood min diff stopping criterion in EM.
void useGreedyHillClimbing()
indicate that we wish to use a greedy hill climbing algorithm
void disableMinEpsilonRate() override
Disable stopping criterion on epsilon rate.
void useScoreBDeu()
indicate that we wish to use a BDeu score
ApproximationSchemeSTATE EMStateApproximationScheme() const
get the current state of EM
void addNoParentNode(NodeId node)
void setSliceOrder(const NodeProperty< NodeId > &slice_order)
sets a partial order on the nodes
bool verbosity() const override
verbosity
double minEpsilonRate() const override
Get the value of the minimal epsilon rate.
bool EMisEnabledMinEpsilonRate() const
bool isUsingEM() const
indicates whether we use EM for parameter learning
IBNLearner(const std::string &filename, const std::vector< std::string > &missingSymbols, bool induceTypes=true)
read the database file for the score / parameter estimation and var names
void setForbiddenArcs(const ArcSet &set)
assign a set of forbidden arcs
bool isEnabledMaxTime() const override
std::string priorDbname_
the filename for the Dirichlet prior, if any
double priorWeight_
the weight of the prior
Size EMPeriodSize() const
returns how many samples EM processes between two tests of its stopping criteria
double maxTime() const override
returns the timeout (in seconds)
void disableMaxIter() override
Disable stopping criterion on max iterations.
double noiseEM_
the noise factor (in (0,1)) used by EM for perturbing the CPT during init
std::vector< std::pair< std::size_t, std::size_t > > ranges_
the set of rows' ranges within the database in which learning is done
void setDatabaseWeight(const double new_weight)
assign a weight to all the rows of the learning database so that the sum of their weights is equal to new_weight
std::vector< Arc > latentVariables() const
get the list of arcs hiding latent variables
void EMdisableMaxTime()
Disable EM's timeout stopping criterion.
void clearDatabaseRanges()
reset the ranges to the one range corresponding to the whole database
Size maxIter() const override
void EMsetMaxIter(Size max)
add a max iteration stopping criterion
std::string checkScorePriorCompatibility() const
checks whether the current score and prior are compatible
void useBDeuPrior(double weight=1.0)
use the BDeu prior
void setVerbosity(bool v) override
verbosity
void setMandatoryArcs(const ArcSet &set)
assign a set of mandatory arcs
void EMdisableMaxIter()
Disable stopping criterion on max iterations.
ParamEstimatorType
an enumeration to select the type of parameter estimation we shall apply
AlgoType
an enumeration to select easily the learning algorithm to use
@ LOCAL_SEARCH_WITH_TABU_LIST
void enableMaxIter() override
Enable stopping criterion on max iterations.
ApproximationSchemeSTATE EMState() const
returns the state of the last EM algorithm executed
bool EMisEnabledMaxIter() const
void EMsetVerbosity(bool v)
sets or unsets EM's verbosity
void setMaxTime(double timeout) override
stopping criterion on timeout If the criterion was disabled it will be enabled
virtual void createPrior_()=0
create the prior used for learning
const std::string & nameFromId(NodeId id) const
returns the variable name corresponding to a given node id
double databaseWeight() const
returns the weight of the whole database
static DatabaseTable readFile_(const std::string &filename, const std::vector< std::string > &missing_symbols)
reads a file and returns a databaseVectInRam
void setMinEpsilonRate(double rate) override
Given that we approximate f(t), stopping criterion on d/dt(|f(t+1)-f(t)|). If the criterion was disabled it will be enabled.
bool EMisEnabledEpsilon() const
return true if EM's stopping criterion is the log-likelihood min diff
K2 algoK2_
the K2 algorithm
void EMdisableEpsilon()
Disable the min log-likelihood diff stopping criterion for EM.
void addMandatoryArc(const Arc &arc)
AlgoType selectedAlgo_
the selected learning algorithm
const std::vector< std::pair< std::size_t, std::size_t > > & databaseRanges() const
returns the current database rows' ranges used for learning
void setMaxIndegree(Size max_indegree)
sets the max indegree
void addPossibleEdge(const Edge &edge)
double logLikelihood(const std::vector< NodeId > &vars, const std::vector< NodeId > &knowing={})
Return the loglikelihood of vars in the base, conditioned by knowing for the BNLearner.
void setInitialDAG(const DAG &)
sets an initial DAG structure
void useK2(const Sequence< NodeId > &order)
indicate that we wish to use K2
Database scoreDatabase_
the database to be used by the scores and parameter estimators
const std::vector< double > & history() const override
INLINE void distributeStop(const ApproximationScheme *approximationScheme, const std::string &message)
distribute signals
bool isEnabledEpsilon() const override
ScoreType
an enumeration enabling to select easily the score we wish to use
bool useEM_
a Boolean indicating whether we should use EM for parameter learning or not
bool isScoreBased() const
indicate if the selected algorithm is score-based
void enableMaxTime() override
stopping criterion on timeout If the criterion was disabled it will be enabled
DAG2BNLearner dag2BN_
the parametric EM
ApproximationSchemeSTATE stateApproximationScheme() const override
returns the current state of the approximation scheme
Prior * prior_
the prior used
void EMenableMinEpsilonRate()
Enable the log-likelihood evolution rate stopping criterion.
bool EMVerbosity() const
returns the EM's verbosity status
void EMsetMaxTime(double timeout)
add a stopping criterion on timeout
void useDatabaseRanges(const std::vector< std::pair< std::size_t, std::size_t > > &new_ranges)
use a new set of database rows' ranges to perform learning
virtual ~IBNLearner()
destructor
std::pair< double, double > chi2(NodeId id1, NodeId id2, const std::vector< NodeId > &knowing={})
Return the <statistic,pvalue> pair for chi2 test in the database.
INLINE void distributeProgress(const ApproximationScheme *approximationScheme, Size pourcent, double error, double time)
distribute signals
void EMdisableMinEpsilonRate()
Disable the log-likelihood evolution rate stopping criterion.
CorrectedMutualInformation * mutualInfo_
the selected correction for miic
double EMCurrentTime() const
get the current running time in second (double)
BNLearnerPriorType
an enumeration to select the prior
@ DIRICHLET_FROM_BAYESNET
@ DIRICHLET_FROM_DATABASE
void erasePossibleEdge(const Edge &edge)
bool isConstraintBased() const
indicate if the selected algorithm is constraint-based
void setMaxIter(Size max) override
stopping criterion on number of iterations. If the criterion was disabled it will be enabled.
void setNumberOfThreads(Size nb) override
sets the number max of threads that can be used
void useScoreBIC()
indicate that we wish to use a BIC score
Size nbDecreasingChanges_
StructuralConstraintNoChildrenNodes constraintNoChildrenNodes_
the constraint on no children nodes
DAG initialDAG()
returns the initial DAG structure
void setPossibleEdges(const EdgeSet &set)
assign a set of possible edges
void enableMinEpsilonRate() override
Enable stopping criterion on epsilon rate.
void useNoPrior()
use no prior
ParamEstimatorType paramEstimatorType_
the type of the parameter estimator
ScoreType scoreType_
the score selected for learning
std::pair< double, double > G2(NodeId id1, NodeId id2, const std::vector< NodeId > &knowing={})
Return the <statistic,pvalue> pair for the G2 test in the database.
const ApproximationScheme * currentAlgorithm_
bool isEnabledMaxIter() const override
Size periodSize() const override
how many samples between two tests of the stopping criteria
void eraseForbiddenArc(const Arc &arc)
void EMenableMaxIter()
Enable stopping criterion on max iterations.
void useSmoothingPrior(double weight=1)
use the prior smoothing
void disableMaxTime() override
Disable stopping criterion on timeout.
double currentTime() const override
get the current running time in second (double)
DAG learnDag_()
returns the DAG learnt
Database * priorDatabase_
the database used by the Dirichlet prior
double mutualInformation(NodeId id1, NodeId id2, const std::vector< NodeId > &knowing={})
Return the mutual information of id1 and id2 in the base, conditioned by knowing for the BNLearner.
void EMsetMinEpsilonRate(double rate)
sets the stopping criterion of EM as being the minimal log-likelihood's evolution rate
void createScore_()
create the score used for learning
PriorType getPriorType_() const
returns the type (as a string) of a given prior
NodeId idFromName(const std::string &var_name) const
returns the node id corresponding to a variable name
void setPeriodSize(Size p) override
how many samples between two tests of the stopping criteria
void useLocalSearchWithTabuList(Size tabu_size=100, Size nb_decrease=2)
indicate that we wish to use a local search with tabu list
void useScoreK2()
indicate that we wish to use a K2 score
StructuralConstraintIndegree constraintIndegree_
the constraint for indegrees
void setEpsilon(double eps) override
Given that we approximate f(t), stopping criterion on |f(t+1)-f(t)|. If the criterion was disabled it will be enabled.
PDAG learnPDAG()
learn a partial structure from a file (must have read the db before and must have selected miic)
void _setPriorWeight_(double weight)
sets the prior weight
std::string filename_
the filename database
void setPossibleSkeleton(const UndiGraph &skeleton)
assign a set of possible edges
void useEMWithRateCriterion(const double epsilon, const double noise=default_EM_noise)
use The EM algorithm to learn parameters with the rate stopping criterion
void useNMLCorrection()
indicate that we wish to use the NML correction for MIIC
void useEM(const double epsilon, const double noise=default_EM_noise)
use The EM algorithm to learn parameters
void useEMWithDiffCriterion(const double epsilon, const double noise=default_EM_noise)
use The EM algorithm to learn parameters with the diff stopping criterion
bool hasMissingValues() const
returns true if the learner's database has missing values
void enableEpsilon() override
Enable stopping criterion on epsilon.
void forbidEM()
prevent using the EM algorithm for parameter learning
double epsilon() const override
Get the value of epsilon.
SimpleMiic algoSimpleMiic_
the MIIC algorithm
Score * score_
the score used
StructuralConstraintMandatoryArcs constraintMandatoryArcs_
the constraint on mandatory arcs
Miic algoMiic_
the Constraint MIIC algorithm
void createCorrectedMutualInformation_()
create the Corrected Mutual Information instance for Miic
EMApproximationScheme & EM()
returns the EM parameter learning approximation scheme if EM is enabled
void EMsetPeriodSize(Size p)
how many samples between two tests of the stopping criteria
void useNoCorrection()
indicate that we wish to use the NoCorr correction for MIIC
StructuralConstraintForbiddenArcs constraintForbiddenArcs_
the constraint on forbidden arcs
void useScoreLog2Likelihood()
indicate that we wish to use a Log2Likelihood score
void setRecordWeight(const std::size_t i, const double weight)
sets the weight of the ith record of the database
GreedyHillClimbing greedyHillClimbing_
the greedy hill climbing algorithm
DAG learnDAG()
learn a structure from a file (must have read the db before)
Size EMMaxIter() const
return the max number of iterations criterion
void useMDLCorrection()
indicate that we wish to use the MDL correction for MIIC
void useDirichletPrior(const std::string &filename, double weight=1)
use the Dirichlet prior from a database
double score(NodeId vars, const std::vector< NodeId > &knowing={})
Return the value of the score currently in use by the BNLearner of a variable given a set of other va...
StructuralConstraintTabuList constraintTabuList_
the constraint for tabu lists
Size nbrIterations() const override
void addForbiddenArc(const Arc &arc)
DAG initialDag_
an initial DAG given to learners
void addNoChildrenNode(NodeId node)
IBNLearner & operator=(const IBNLearner &)
copy operator
MixedGraph prepareSimpleMiic_()
prepares the initial graph for Simple Miic
bool isEnabledMinEpsilonRate() const override
Size domainSize(NodeId var) const
returns the domain size of the variable with node id var
void useScoreAIC()
indicate that we wish to use an AIC score
const std::vector< std::string > & names() const
returns the names of the variables in the database
void eraseMandatoryArc(const Arc &arc)
static void isCSVFileName_(const std::string &filename)
checks whether the extension of a CSV filename is correct
MixedGraph prepareMiic_()
prepares the initial graph for miic
void useMIIC()
indicate that we wish to use MIIC
LocalSearchWithTabuList localSearchWithTabuList_
the local search with tabu list algorithm
std::pair< std::size_t, std::size_t > useCrossValidationFold(const std::size_t learning_fold, const std::size_t k_fold)
sets the ranges of rows to be used for cross-validation learning
INLINE void setCurrentApproximationScheme(const ApproximationScheme *approximationScheme)
distribute signals
ParamEstimator * createParamEstimator_(const DBRowGeneratorParser &parser, bool take_into_account_score=true)
create the parameter estimator used for learning
StructuralConstraintSliceOrder constraintSliceOrder_
the constraint for 2TBNs
const DatabaseTable & database() const
returns the database used by the BNLearner
static constexpr double default_EM_noise
the default noise amount added to CPTs during EM's initialization (see method useEM())
void eraseNoParentNode(NodeId node)
const std::vector< double > & EMHistory() const
returns the history of the last EM execution
std::vector< double > rawPseudoCount(const std::vector< NodeId > &vars)
Return the pseudo-counts of NodeIds vars in the base in a raw array.
bool EMisEnabledMaxTime() const
bool inducedTypes_
the policy for typing variables
CorrectedMutualInformation::KModeTypes kmodeMiic_
the penalty used in MIIC
void EMsetEpsilon(double eps)
sets the stopping criterion of EM as being the minimal difference between two consecutive log-likelihoods
double correctedMutualInformation(NodeId id1, NodeId id2, const std::vector< NodeId > &knowing={})
Return the mutual information of id1 and id2 in the base, conditioned by knowing for the BNLearner.
void useScoreBD()
indicate that we wish to use a BD score
void EMenableMaxTime()
enable EM's timeout stopping criterion
std::string EMStateMessage() const
returns the state of the EM algorithm
The local search with tabu list learning algorithm (for directed graphs).
The Miic learning algorithm.
the no-prior class: corresponds to a 0-weight sample
The base class for estimating parameters of CPTs.
the base class for all priors
The base class for all the scores used for learning (BIC, BDeu, etc).
The miic learning algorithm.
the structural constraint for forbidding the creation of some arcs during structure learning
the class for structural constraints limiting the number of parents of nodes in a directed graph
the structural constraint indicating that some arcs shall never be removed or reversed
the structural constraint for forbidding children for some nodes
the structural constraint for forbidding parents for some nodes
the structural constraint for forbidding the creation of some arcs except those defined in the class ...
the structural constraint imposing a partial order over nodes
The class imposing a N-sized tabu list as a structural constraints for learning algorithms.
Class building the essential Graph from a DAGmodel.
#define GUM_ERROR(type, msg)
The basic class for computing the set of digraph changes allowed by the user to be executed by the le...
The basic class for computing the set of digraph changes allowed by the user to be executed by the le...
The mechanism to compute the next available graph changes for directed structure learning search algor...
std::size_t Size
In aGrUM, hashed values are unsigned long int.
Set< Edge > EdgeSet
Some typedefs and defines for shortcuts ...
Size NodeId
Type for node ids.
Set< Arc > ArcSet
Some typedefs and defines for shortcuts ...
HashTable< NodeId, VAL > NodeProperty
Property on graph elements.
The local search learning with tabu list algorithm (for directed graphs).
include the inlined functions if necessary
unsigned int getNumberOfThreads()
returns the max number of threads used by default when entering the next parallel region
the class for estimating parameters of CPTs using Maximum Likelihood
the class for computing AIC scores
the class for computing Bayesian Dirichlet (BD) log2 scores
the class for computing BDeu scores
the class for computing K2 scores (actually their log2 value)
#define GUM_EMIT1(signal, arg1)
#define GUM_EMIT3(signal, arg1, arg2, arg3)
the base class for structural constraints imposed by DAGs
the structural constraint for forbidding the creation of some arcs during structure learning
the class for structural constraints limiting the number of parents of nodes in a directed graph
the structural constraint indicating that some arcs shall never be removed or reversed
the structural constraint for forbidding children for some nodes during structure learning
the structural constraint for forbidding parents for some nodes during structure learning
the structural constraint for forbidding the creation of some arcs during structure learning
the structural constraint imposing a partial order over nodes
the class imposing a N-sized tabu list as a structural constraints for learning algorithms