Parametric EM (missing data)

Creative Commons License

aGrUM

interactive online version

In [1]:
import pyAgrum as gum
import pyAgrum.lib.notebook as gnb

import os
# The generated databases are saved in "out/*.csv".
EMnomissing="out/EM_nomissing.csv"
EMmissing="out/EM_missing.csv"
# Create the output directory up front so that the first write below does not
# fail on a fresh checkout (exist_ok keeps re-running this cell harmless).
for _csv_path in (EMnomissing, EMmissing):
    os.makedirs(os.path.dirname(_csv_path), exist_ok=True)

Generating data with missing values (at random)

In [2]:
# Seed aGrUM's random generator so that a "Restart & Run All" reproduces the
# same reference network and the same sampled database.
gum.initRandom(42)
# Reference network used as ground truth in the rest of the notebook.
src=gum.fastBN("A->B<-C->D->E<-B;D->F")
# Sample 5000 complete records (no missing value yet) into EMnomissing.
gum.generateSample(src,5000,EMnomissing,random_order=False)
src
Out[2]:
G B B E E B->E F F C C C->B D D C->D A A A->B D->F D->E
In [3]:
import pandas as pd
import numpy as np

def add_missing(src,dst,proba,seed=None):
  """Copy a CSV database while hiding each cell independently with probability `proba`.

  Parameters
  ----------
  src : path of the complete CSV file to read.
  dst : path of the CSV file to write; hidden cells are written as '?'.
  proba : probability (between 0 and 1) that any given cell is hidden.
  seed : optional seed for the random mask. With the default (None) a fresh,
    unpredictable mask is drawn on each call; an integer makes it reproducible.

  Raises
  ------
  ValueError : if `proba` is not in [0, 1].
  """
  if not 0 <= proba <= 1:
    raise ValueError(f"proba must be in [0, 1], got {proba}")
  df=pd.read_csv(src)
  # A local generator keeps the draw independent from numpy's global state.
  rng=np.random.default_rng(seed)
  mask=rng.random(df.shape)<proba
  # Masking turns the touched integer columns into floats (NaN needs a float
  # dtype), hence float_format='%.0f' to keep writing the values without decimals.
  df.mask(mask).to_csv(dst,na_rep='?',index=False,float_format='%.0f')

# EMnomissing was already sampled from `src` right after the network was built;
# sampling it again here would only overwrite it with another random draw.
# Hide roughly 10% of the cells of the complete database.
add_missing(EMnomissing,EMmissing,proba=0.1)
In [4]:
# Show the first 10 lines (header included) of the complete database, then of
# the one with hidden cells, each preceded by its title.
for title, csv_path in (("No missing", EMnomissing), ("Missing", EMmissing)):
    print(title)
    with open(csv_path,"r") as csvfile:
        for _ in range(10):
            print(csvfile.readline(),end="")
No missing
A,B,C,D,E,F
1,1,0,1,1,0
1,1,0,1,0,1
0,0,1,0,0,1
0,0,1,0,0,0
1,1,0,1,0,0
1,0,1,0,0,0
0,1,1,0,1,0
1,0,1,0,0,1
1,0,1,0,0,1
Missing
A,B,C,D,E,F
1,1,0,1,1,0
1,1,0,1,0,1
0,0,1,0,0,1
0,0,1,0,0,0
1,?,0,1,0,?
1,0,1,0,0,0
0,1,1,0,1,0
1,0,1,0,0,1
?,0,1,0,0,1

Learning with missing data

In [5]:
# Build a learner on the incomplete database, using `src` as the template for
# the variables; "?" is the marker add_missing wrote for the hidden cells.
learner = gum.BNLearner(EMmissing, src, ["?"])
has_missing = learner.hasMissingValues()
print(f"Missing values in {EMmissing} : {has_missing}")
Missing values in out/EM_missing.csv : True
In [6]:
# Without EM the learner is expected to refuse a database that has missing
# values; the exception is caught on purpose, since it is the demonstration.
structure = src.dag()
try:
    learner.learnParameters(structure)
except gum.MissingValueInDatabase:
    print("Learning is not possible without EM if there are some missing values.")
Learning is not possible without EM if there are some missing values.
In [7]:
# Switch the learner to EM (epsilon 1e-3) and add a smoothing prior.
learner.useEM(1e-3)
learner.useSmoothingPrior()
print(learner)

# Estimate the parameters on the known structure of the reference network.
bn = learner.learnParameters(src.dag())
print(f"# iterations : {learner.nbrIterations()}")

# Side by side: marginals of the reference network and of the learned one.
inference_src = gnb.getInference(src)
inference_bn = gnb.getInference(bn)
gnb.flow.row(inference_src, inference_bn)
Filename       : out/EM_missing.csv
Size           : (5000,6)
Variables      : A[2], B[2], C[2], D[2], E[2], F[2]
Induced types  : False
Missing values : True
Algorithm      : Greedy Hill Climbing
Score          : BDeu
Correction     : MDL  (Not used for score-based algorithms)
Prior          : Smoothing  (The BDeu score already contains a different 'implicit' prior. Therefore, the learning will probably be biased.)
Prior weight   : 1.000000
EM             : True
EM epsilon     : 0.001000

# iterations : 6
structs Inference in   0.32ms A 2023-05-09T10:07:56.084200 image/svg+xml Matplotlib v3.7.1, https://matplotlib.org/ B 2023-05-09T10:07:56.147125 image/svg+xml Matplotlib v3.7.1, https://matplotlib.org/ A->B E 2023-05-09T10:07:56.326849 image/svg+xml Matplotlib v3.7.1, https://matplotlib.org/ B->E C 2023-05-09T10:07:56.207940 image/svg+xml Matplotlib v3.7.1, https://matplotlib.org/ C->B D 2023-05-09T10:07:56.268342 image/svg+xml Matplotlib v3.7.1, https://matplotlib.org/ C->D D->E F 2023-05-09T10:07:56.386748 image/svg+xml Matplotlib v3.7.1, https://matplotlib.org/ D->F
structs Inference in   0.41ms A 2023-05-09T10:07:56.635206 image/svg+xml Matplotlib v3.7.1, https://matplotlib.org/ B 2023-05-09T10:07:56.699331 image/svg+xml Matplotlib v3.7.1, https://matplotlib.org/ A->B E 2023-05-09T10:07:56.889404 image/svg+xml Matplotlib v3.7.1, https://matplotlib.org/ B->E C 2023-05-09T10:07:56.765113 image/svg+xml Matplotlib v3.7.1, https://matplotlib.org/ C->B D 2023-05-09T10:07:56.828037 image/svg+xml Matplotlib v3.7.1, https://matplotlib.org/ C->D D->E F 2023-05-09T10:07:56.957159 image/svg+xml Matplotlib v3.7.1, https://matplotlib.org/ D->F

Learning with a smaller EM stopping threshold (and no smoothing prior)

In [8]:
# Fresh learner on the same incomplete database: no smoothing prior this time.
learner = gum.BNLearner(EMmissing, src, ["?"])
# NOTE(review): verbosity is presumably what lets learner.history() return the
# per-iteration errors plotted further down -- confirm against the pyAgrum docs.
learner.setVerbosity(True)
learner.useEM(1e-8)
bn2 = learner.learnParameters(src.dag())

# Side by side: reference network vs. EM estimate, with the iteration count as caption.
inference_src = gnb.getInference(src)
inference_bn2 = gnb.getInference(bn2)
em_caption = f"Estimation EM en {learner.nbrIterations()} iteration(s)"
gnb.flow.row(inference_src, inference_bn2, captions=["Source", em_caption])
structs Inference in   0.35ms A 2023-05-09T10:07:58.637349 image/svg+xml Matplotlib v3.7.1, https://matplotlib.org/ B 2023-05-09T10:07:58.786267 image/svg+xml Matplotlib v3.7.1, https://matplotlib.org/ A->B E 2023-05-09T10:07:58.968281 image/svg+xml Matplotlib v3.7.1, https://matplotlib.org/ B->E C 2023-05-09T10:07:58.847238 image/svg+xml Matplotlib v3.7.1, https://matplotlib.org/ C->B D 2023-05-09T10:07:58.906748 image/svg+xml Matplotlib v3.7.1, https://matplotlib.org/ C->D D->E F 2023-05-09T10:07:59.028465 image/svg+xml Matplotlib v3.7.1, https://matplotlib.org/ D->F
Source
structs Inference in   0.55ms A 2023-05-09T10:07:59.200336 image/svg+xml Matplotlib v3.7.1, https://matplotlib.org/ B 2023-05-09T10:07:59.258654 image/svg+xml Matplotlib v3.7.1, https://matplotlib.org/ A->B E 2023-05-09T10:07:59.432800 image/svg+xml Matplotlib v3.7.1, https://matplotlib.org/ B->E C 2023-05-09T10:07:59.316887 image/svg+xml Matplotlib v3.7.1, https://matplotlib.org/ C->B D 2023-05-09T10:07:59.375702 image/svg+xml Matplotlib v3.7.1, https://matplotlib.org/ C->D D->E F 2023-05-09T10:07:59.571006 image/svg+xml Matplotlib v3.7.1, https://matplotlib.org/ D->F
Estimation EM en 13 iteration(s)
In [9]:
import matplotlib.pyplot as plt
import numpy as np
# Plot the history recorded by the learner against the iteration number (1-based).
iterations = np.arange(1, 1 + learner.nbrIterations())
plt.plot(iterations, learner.history())
# One tick every second iteration keeps the x axis readable.
plt.xticks(iterations[::2])
plt.title("Error during EM iterations");
../_images/notebooks_35-Learning_ParametricEM_13_0.svg
In [ ]: