diff --git a/.gitignore b/.gitignore
index dba3e43..471b709 100644
--- a/.gitignore
+++ b/.gitignore
@@ -220,3 +220,4 @@ TSWLatexianTemp*
 .Rproj.user
 .Rhistory
 *.Rproj
+*.*~
diff --git a/chapitre20.tex b/chapitre20.tex
new file mode 100644
index 0000000..0abf19d
--- /dev/null
+++ b/chapitre20.tex
@@ -0,0 +1,106 @@
+\section{Chapitre 20: Apprentissage de modèles probabilistes}
+\label{sec:ch20}
+
+\subsection{Apprentissage bayesien}
+\label{sec:ch20apprbayesien}
+
+Forme d'apprentissage: mise à jour de la distribution de probabilités de l'espace des hypothèses.
+
+\begin{align}
+  P(h_i|\mathbf{d}) &= \underbrace{\alpha P(\mathbf{d} \mid h_i)}_{vraisemblance}\underbrace{P(h_i)}_{a priori}\\
+  P(X|\mathbf{d}) &= \sum_iP(X|\mathbf{d},h_i)P(h_i|\mathbf{d})\\
+                    &=\sum_iP(X|h_i)P(h_i|\mathbf{d})
+\end{align}
+
+\subsection{Apprentissage du maximum a posteriori et de la vraisemblance}
+\label{sec:ch20maximumvrai}
+
+Choisir $h_{MAP}$ maximisant $P(h_i|\mathbf{d})$:
+
+\begin{align}
+  h_{MAP} &= \argmax_{h_i}P(\mathbf{d}|h_i)P(h_i)\\
+          &= \argmax_{h_i} \log P(\mathbf{d}|h_i) + \log P(h_i) 
+\end{align}
+
+C'est la base de l'apprentissage à longueur de description minimale. Pour de grand ensembles de données, l'hypothèse à priori n'est pas pertinente. On utilise alors l'apprentissage par maximum de vraisemblance. On choisit $h_{ML}$ qui maximise $P(\mathbf{d}|h_i)$.
+
+\subsection{Apprentissage des réseaux bayesiens}
+\label{sec:ch20reseaux}
+
+On cherche une proportion $\theta$ d'éléments dans un ensemble de taille $N$ où $F=1$. L'hypothèse figure parmi un continuum d'hypothèses représenté par une distribution binomiale. On décompte $x$ exemples où $F=1$.
+
+\begin{align}
+  P(\mathbf{d}|h_{\theta}) &= \prod_{j=1}^NP(d_j|h_\theta)\\
+  &= \theta^x(1-\theta)^{N-x}
+\end{align}
+
+Maximiser cette probabilité est plus facile en passant par le logarithme:
+
+\begin{align}
+  L(\mathbf{d}|h_{\theta}) &= \log P(\mathbf{d}|h_{\theta})\\
+                           &= \sum {j=1}^N \log P(d_j|h_\theta)\\
+                           &= x \log \theta + (N-x) \log (1-\theta)\\
+  \frac{\partial L(\mathbf{d}|h_{\theta})}{\partial \theta} &= \frac{x}{\theta}-\frac{N-x}{1-\theta} = 0\\
+                           \Rightarrow \theta &= \frac{x}{x+N-x} = \frac{x}{N}
+\end{align}
+
+Si on ajoute une autre variable $W$ qui dépend de la première variable $F$, on ajoute par exemple les paramètres $\theta_1$ et $\theta_2$ pour $P(W|F=0)$ et $P(W|F=1)$. La vraisemblance devient:
+\begin{align}
+  P(F=1,W=0|h_{\theta,\theta_1,\theta_2}) &= P(F=1|h_{\theta,\theta_1,\theta_2})P(W=0|F=1,h_{\theta,\theta_1,\theta_2}) \\
+  &= \theta (1-\theta_1)
+\end{align}
+
+On a N éléments où $x_y$ valent $F=1,W=1$, $x_{\not y}$ valent $F=1,W=0$, $\not x_y$ valent $F=0,W=1$, $\not x_{\not y}$ valent $F=0,W=0$.
+
+\begin{align}
+  P(\mathbf{d}|h_{\theta,\theta_1,\theta_2}) &= \theta^x(1-\theta)^{N-x}\theta_1^{x_y}(1-\theta_1)^{\not x_y}\theta_2^{x_{not_y}}(1-\theta_2)^{\not x_{\not y}}\\
+  L (\mathbf{d}|h_{\theta,\theta_1,\theta_2}) &= x \log \theta + (N-x) \log (1-\theta)\\
+                                             &+ x_y \log \theta_1 + \not x_y \log (1-\theta_1)\\
+                                             &+ x_{\not_y} \log \theta_2 + \not x_{\not y} (1-\theta_2)
+\end{align}
+
+En utilisant des données complètes, le problème de l'apprentissage par vraisemblance des paramètres pour un réseau bayesien se décompose en un problème séparé pour chaque paramètre.
+
+\subsection{Modèle de Bayes Naif}
+\label{sec:ch20bayesnaif}
+
+\begin{itemize}
+\item Modèle simple qui fonctionne bien dans de nombreuses situations. 
+\item Assume l'indépendance conditionnelle des attributs par rapport à la valeur. 
+\item On évalue les probabilités depuis les exemples d'apprentissage, puis on multiplue les probabilités conditionnelles pour chacunes des valeurs de la variable réponse. 
+\item On choisit la variable réponse avec le plus grande probabilité. 
+\end{itemize}
+
+\begin{align}
+  v = \argmax_{v_j \in V}P(v_j)\prod_{i}P(X_i|v_j)
+\end{align}
+
+\subsection{Modèles continus}
+\label{sec:ch20modelescont}
+
+Hypothèses:
+
+\begin{itemize}
+\item Lignes droites avec un bruit gaussien.
+\item On choisit les paramètres $\theta = (a,b)$ qui maximisent la vraisemblance.
+\item Les exemples dont indépendants et identiquement distribués
+\end{itemize}
+
+\begin{align}
+  P(d_j|h_i) &= \alpha \exp (-\frac{y_j-(ax_j+b)^2}{2\sigma^2})\\
+  L(d_j|h_i) &= -\alpha^{\prime}\sum_j (y_j-(ax_j+b))^2\\
+  \frac{\partial L}{\partial a} &= -\alpha^{\prime} \sum_j 2(y_j-(ax_j+b))(-x_j)=0\\
+  \frac{\partial L}{\partial b} &= -\alpha^{\prime} \sum_j 2(y_j-(ax_j+b))(-1)=0\\
+\end{align}
+
+On résous pour les paramètres $a$ et $b$:
+
+\begin{align}
+  a &= \frac{\sum_j x_j \sum_j y_j - N\sum_j x_jy_j}{(\sum_j x_j)^2 - N \sum_j x_j^2}\\
+  b &= \frac{1}{N}(\sum_j y_j - a \sum_j x_j)
+\end{align}
+
+\subsection{Apprentissage avec variables cachées (EM)}
+\label{sec:ch20em}
+
+
diff --git a/notes_de_cours.tex b/notes_de_cours.tex
index f7ca96c..fa8aa7e 100644
--- a/notes_de_cours.tex
+++ b/notes_de_cours.tex
@@ -109,6 +109,7 @@
 \include{chapitre16}
 \include{chapitre17}
 \include{chapitre18}
+\include{chapitre20}
 % Ajouter les autres chapitres au besoin
 
 \bibliography{bibliographie}