diff --git a/PFR/entropy_basic.lean b/PFR/entropy_basic.lean
index dbaaa58e..03237c5b 100644
--- a/PFR/entropy_basic.lean
+++ b/PFR/entropy_basic.lean
@@ -1,86 +1,435 @@
-import PFR.probability_space
+
 import PFR.neg_xlogx
--- to do: be more specific on which part of Mathlib is being imported
-
-/-- The purpose of this file is to develop the basic theory of Shannon entropy. -/
-
-/- In this file, inversion will always mean inversion of real numbers. -/
-local macro_rules | `($x ⁻¹)   => `(Inv.inv ($x : ℝ))
-
-open Real
-open BigOperators
-
-variable {Ω S : Type*} [ProbabilitySpace Ω] [Fintype S] {X : Ω → S}
-
-/-- The entropy of a random variable. -/
-noncomputable def entropy (X : Ω → S) : ℝ := ∑ s, h (P[X ⁻¹' {s}])
-
-notation:100 "H[ " X " ]" => entropy X
-
-/-- Entropy is non-negative -/
-lemma entropy_nonneg (X : Ω → S) : 0 ≤ H[X] := by
-  unfold entropy
-  apply Finset.sum_nonneg
-  intro s _
-  apply h_nonneg
-  . simp
-  apply ProbabilitySpace.prob_le_one
-
-/-- Entropy vanishes in the degenerate case -/
-lemma entropy_zero  (hΩ : ¬ProbabilitySpace.isNondeg Ω) (X : Ω → S) : H[X] = 0 := by
-  unfold entropy
-  conv =>
-    lhs; congr; rfl; ext s
-    rw [ProbabilitySpace.prob_zero hΩ]
-  unfold h; simp
-
-/-- The Jensen bound -/
-lemma entropy_le_log (hX : Measurable X): H[X] ≤ log (Fintype.card S) := by
-  by_cases hΩ : ProbabilitySpace.isNondeg Ω
-  . set N := Fintype.card S
-    have : 0 < N := ProbabilitySpace.range_nonempty' hΩ hX
-    unfold entropy
-    have hN : log N = N * h (∑ s : S, N⁻¹ * P[ X ⁻¹' {s} ]) := by
-      rw [<-Finset.mul_sum]
-      norm_cast
-      rw [ProbabilitySpace.totalProb hΩ hX]
+import Mathlib.Probability.Notation
+import Mathlib.Probability.ConditionalProbability
+
+/-!
+# Entropy and conditional entropy
+
+## Main definitions
+
+* `measureEntropy`: entropy of a measure
+* `entropy`: entropy of a random variable, defined as `measureEntropy (volume.map X)`
+* `condEntropy`: conditional entropy of a random variable `X` w.r.t. another one `Y`
+* `mutualInformation`: mutual information of two random variables
+
+## Main statements
+
+* `chain_rule`: `entropy (fun ω ↦ (X ω, Y ω)) = entropy Y + condEntropy X Y`
+
+## Notations
+
+* `Hm[μ] = measureEntropy μ`
+* `H[X] = entropy X`
+* `H[X | Y ← y] = Hm[(ℙ[| Y ⁻¹' {y}]).map X]`
+* `H[X | Y] = condEntropy X Y`, such that `H[X | Y] = (volume.map Y)[fun y ↦ H[X | Y ← y]]`
+* `I[X : Y] = mutualInformation X Y`
+
+All notations except for `Hm[]` have variants where we can specify the measure (which is otherwise
+supposed to be `volume`). For example `H[X ; μ]` and `I[X : Y ; μ]` instead of `H[X]` and
+`I[X : Y]` respectively.
+
+-/
+
+open Real MeasureTheory
+
+open scoped ENNReal NNReal Topology ProbabilityTheory BigOperators
+
+section aux_lemmas
+
+-- todo: is this somewhere?
+lemma integral_eq_sum {S E : Type*} [Fintype S] [MeasurableSpace S] [MeasurableSingletonClass S]
+    [NormedAddCommGroup E] [NormedSpace ℝ E] [CompleteSpace E]
+     (μ : Measure S) [IsFiniteMeasure μ] (f : S → E) :
+    ∫ x, f x ∂μ = ∑ x, (μ {x}).toReal • (f x) := by
+  conv_lhs => rw [← Measure.sum_smul_dirac μ]
+  have hf : Integrable f μ := integrable_of_fintype _ _
+  have hf' : Integrable f (Measure.sum fun a ↦ μ {a} • Measure.dirac a) := by
+    rwa [Measure.sum_smul_dirac μ]
+  rw [integral_sum_measure hf']
+  simp_rw [integral_smul_measure, integral_dirac]
+  rw [tsum_fintype]
+
+/-- `μ[|s]` is a finite measure whenever `μ` is finite. -/
+instance cond_isFiniteMeasure {α : Type*} {mα : MeasurableSpace α} {μ : Measure α}
+    [IsFiniteMeasure μ] (s : Set α) :
+    IsFiniteMeasure (μ[|s]) := by
+  constructor
+  rw [ProbabilityTheory.cond]
+  simp only [Measure.smul_toOuterMeasure, OuterMeasure.coe_smul, Pi.smul_apply, MeasurableSet.univ,
+    Measure.restrict_apply, Set.univ_inter, smul_eq_mul]
+  by_cases hμs : μ s = 0
+  · simp [hμs]
+  · refine ENNReal.mul_lt_top ?_ (measure_ne_top _ _)
+    simp only [ne_eq, ENNReal.inv_eq_top]
+    exact hμs
+
+lemma cond_eq_zero_of_measure_zero {α : Type*} {_ : MeasurableSpace α} {μ : Measure α} {s : Set α}
+    (hμs : μ s = 0) :
+    μ[|s] = 0 := by
+  have : μ.restrict s = 0 := by simp [hμs]
+  simp [ProbabilityTheory.cond, this]
+
+lemma sum_measure_singleton {S : Type*} [Fintype S] {_ : MeasurableSpace S}
+    [MeasurableSingletonClass S]  (μ : Measure S) [IsProbabilityMeasure μ] :
+    ∑ x, μ {x} = 1 := by
+  change ∑ x, μ (id ⁻¹' {x}) = 1
+  rw [sum_measure_preimage_singleton]
+  · simp
+  · simp
+
+end aux_lemmas
+
+
+
+
+namespace ProbabilityTheory
+
+variable {Ω S T : Type*} [mΩ : MeasurableSpace Ω]
+  [Fintype S] [Fintype T] [MeasurableSpace S] [MeasurableSpace T]
+
+section measureEntropy
+
+variable {μ : Measure S}
+
+/-- Entropy of a measure on a finite measurable space.
+
+We normalize the measure by `(μ Set.univ)⁻¹` to extend the entropy definition to finite measures.
+What we really want to do is deal with `μ=0` or `IsProbabilityMeasure μ`, but we don't have
+a typeclass for that (we could create one though).
+The added complexity in the expression is not an issue because if `μ` is a probability measure,
+a call to `simp` will simplify `(μ Set.univ)⁻¹ • μ` to `μ`. -/
+noncomputable
+def measureEntropy (μ : Measure S := by volume_tac) : ℝ :=
+  ∑ s, negIdMulLog (((μ Set.univ)⁻¹ • μ) {s}).toReal
+
+lemma measureEntropy_def (μ : Measure S) :
+    measureEntropy μ = ∑ s, negIdMulLog (((μ Set.univ)⁻¹ • μ) {s}).toReal := rfl
+
+notation:100 "Hm[" μ "]" => measureEntropy μ
+
+@[simp]
+lemma measureEntropy_zero : Hm[(0 : Measure S)] = 0 := by
+  simp [measureEntropy]
+
+lemma measureEntropy_of_not_isFiniteMeasure (h : ¬ IsFiniteMeasure μ) :
+    Hm[μ] = 0 := by
+  simp [measureEntropy, not_isFiniteMeasure_iff.mp h]
+
+lemma measureEntropy_of_isProbabilityMeasure (μ : Measure S) [IsProbabilityMeasure μ] :
+    Hm[μ] = ∑ s, negIdMulLog (μ {s}).toReal := by
+  simp [measureEntropy]
+
+lemma measureEntropy_univ_smul : Hm[(μ Set.univ)⁻¹ • μ] = Hm[μ] := by
+  by_cases hμ_fin : IsFiniteMeasure μ
+  swap
+  · rw [measureEntropy_of_not_isFiniteMeasure hμ_fin]
+    rw [not_isFiniteMeasure_iff] at hμ_fin
+    simp [hμ_fin]
+  cases eq_zero_or_neZero μ with
+  | inl hμ => simp [hμ]
+  | inr hμ =>
+    rw [measureEntropy_def]
+    simp only [Measure.smul_toOuterMeasure, OuterMeasure.coe_smul, Pi.smul_apply, smul_eq_mul,
+      ENNReal.toReal_mul]
+    rw [ENNReal.inv_mul_cancel]
+    · simp only [inv_one, ENNReal.one_toReal, one_mul]
+      simp [measureEntropy]
+    · simp [hμ.out]
+    · exact measure_ne_top _ _
+
+lemma measureEntropy_nonneg (μ : Measure S) : 0 ≤ Hm[μ] := by
+  by_cases hμ_fin : IsFiniteMeasure μ
+  swap; · rw [measureEntropy_of_not_isFiniteMeasure hμ_fin]
+  refine Finset.sum_nonneg (fun s _ ↦ negIdMulLog_nonneg ENNReal.toReal_nonneg ?_)
+  refine ENNReal.toReal_le_of_le_ofReal zero_le_one ?_
+  rw [ENNReal.ofReal_one]
+  cases eq_zero_or_neZero μ with
+  | inl hμ => simp [hμ]
+  | inr hμ => exact prob_le_one
+
+lemma measureEntropy_le_card_aux [MeasurableSingletonClass S]
+    (μ : Measure S) [IsProbabilityMeasure μ] :
+    Hm[μ] ≤ log (Fintype.card S) := by
+  cases isEmpty_or_nonempty S with
+  | inl h =>
+    have : μ = 0 := Subsingleton.elim _ _
+    simp [Fintype.card_eq_zero, this]
+  | inr h =>
+    set N := Fintype.card S
+    have : 0 < N := Fintype.card_pos
+    rw [measureEntropy_def]
+    simp only [measure_univ, inv_one, one_smul]
+    calc ∑ x, negIdMulLog (μ {x}).toReal
+      = ∑ x, negIdMulLog (μ {x}).toReal := rfl
+    _ = N * ∑ x, (N : ℝ)⁻¹ * negIdMulLog (μ {x}).toReal := by
+        rw [Finset.mul_sum]
+        congr with x
+        rw [← mul_assoc, mul_inv_cancel, one_mul]
+        simp [this.ne']
+    _ ≤ N * negIdMulLog (∑ x : S, (N : ℝ)⁻¹ * (μ {x}).toReal) := by
+        refine mul_le_mul le_rfl ?_ ?_ ?_
+        · refine sum_negIdMulLog_le ?_ ?_ ?_
+          · simp
+          · simp [Finset.card_univ]
+          · exact fun _ ↦ ENNReal.toReal_nonneg
+        · refine Finset.sum_nonneg (fun x _ ↦ ?_)
+          refine mul_nonneg ?_ ?_
+          · simp [this]
+          · refine negIdMulLog_nonneg (by simp) ?_
+            refine ENNReal.toReal_le_of_le_ofReal zero_le_one ?_
+            rw [ENNReal.ofReal_one]
+            exact prob_le_one
+        · positivity
+    _ = N * negIdMulLog ((N : ℝ)⁻¹) := by
+        congr
+        rw [← Finset.mul_sum]
+        simp only [ne_eq, inv_eq_zero, Nat.cast_eq_zero, Fintype.card_ne_zero, not_false_eq_true,
+          mul_eq_left₀]
+        rw [← ENNReal.toReal_sum (fun _ _ ↦ measure_ne_top _ _), ENNReal.toReal_eq_one_iff]
+        exact sum_measure_singleton _
+    _ = - log ((N : ℝ)⁻¹) := by
+        simp [negIdMulLog]
+    _ = log (Fintype.card S) := by simp [Finset.card_univ]
+
+lemma measureEntropy_le_log_card [MeasurableSingletonClass S] (μ : Measure S) :
+    Hm[μ] ≤ log (Fintype.card S) := by
+  have h_log_card_nonneg : 0 ≤ log (Fintype.card S) := by
+    cases isEmpty_or_nonempty S with
+    | inl h =>
+      rw [Fintype.card_eq_zero]
       simp
-      unfold h
-      rw [log_inv]
-      field_simp; ring
-    rw [hN, <- inv_mul_le_iff, Finset.mul_sum]
-    set w := fun _ : S ↦ N⁻¹
-    set p := fun s : S ↦ (P[ X ⁻¹' {s} ] : ℝ)
-
-    conv =>
-      congr
-      . congr; rfl
-        ext s
-        rw [(show N⁻¹ = w s by simp), (show P[ X ⁻¹' {s} ] = p s by simp)]
-      congr; congr; rfl
-      ext s
-      rw [(show N⁻¹ = w s by simp), (show P[ X ⁻¹' {s} ] = p s by simp)]
-    apply h_jensen
-    . intros; simp
-    . simp; apply mul_inv_cancel; positivity
-    . intro s _
-      simp; norm_cast
-      exact ProbabilitySpace.prob_le_one (X ⁻¹' {s})
-    positivity
-  rw [entropy_zero hΩ]
-  positivity
-
-/-- Equality in Jensen is attained when X is uniform.  TODO: also establish converse.  One could
-also remove hΩ but this seems of little use.  -/
-lemma entropy_of_uniform (hΩ : ProbabilitySpace.isNondeg Ω) (hX : ProbabilitySpace.isUniform X) :
-    H[X] = log (Fintype.card S) := by
-  rcases hX with ⟨ hX1, hX2 ⟩
-  unfold entropy
-  conv =>
-    lhs; congr; rfl; ext s
-    rw [hX2 s]
-  simp [h]
-  have := ProbabilitySpace.range_nonempty' hΩ hX1
-  field_simp
-  rw [mul_comm]
+    | inr h =>
+      refine log_nonneg ?_
+      simp only [Nat.one_le_cast]
+      exact Fintype.card_pos
+  cases eq_zero_or_neZero μ with
+  | inl hμ =>
+    simp only [hμ, measureEntropy_zero]
+    exact h_log_card_nonneg
+  | inr hμ =>
+    by_cases hμ_fin : IsFiniteMeasure μ
+    swap;
+    · rw [measureEntropy_of_not_isFiniteMeasure hμ_fin]
+      exact h_log_card_nonneg
+    rw [← measureEntropy_univ_smul]
+    exact measureEntropy_le_card_aux _
+
+end measureEntropy
+
+section entropy
+
+variable {μ : Measure Ω} {X : Ω → S} {Y : Ω → T}
+
+/-- Entropy of a random variable with values in a finite measurable space. -/
+noncomputable
+def entropy (X : Ω → S) (μ : Measure Ω := by volume_tac) := Hm[μ.map X]
+
+notation3:100 "H[" X "; " μ "]" => entropy X μ
+notation3:100 "H[" X "]" => entropy X volume
+notation3:100 "H[" X "|" Y "←" y "; " μ "]" => entropy X (μ[|Y ⁻¹' {y}])
+notation3:100 "H[" X "|" Y "←" y "]" => entropy X (ℙ[|Y ⁻¹' {y}])
+
+lemma entropy_def (X : Ω → S) (μ : Measure Ω) : entropy X μ = Hm[μ.map X] := rfl
+
+lemma entropy_nonneg (X : Ω → S) (μ : Measure Ω) : 0 ≤ entropy X μ := measureEntropy_nonneg _
+
+lemma entropy_le_log_card [MeasurableSingletonClass S]
+    (X : Ω → S) (μ : Measure Ω) : entropy X μ ≤ log (Fintype.card S) :=
+  measureEntropy_le_log_card _
+
+lemma entropy_eq_sum (hX : Measurable X) (μ : Measure Ω) [IsProbabilityMeasure μ] :
+    entropy X μ = ∑ x, negIdMulLog (μ.map X {x}).toReal := by
+  have : IsProbabilityMeasure (Measure.map X μ) := isProbabilityMeasure_map hX.aemeasurable
+  rw [entropy_def, measureEntropy_of_isProbabilityMeasure]
+
+lemma entropy_cond_eq_sum (hX : Measurable X) (μ : Measure Ω) [IsProbabilityMeasure μ] (y : T) :
+    H[X | Y ← y ; μ] = ∑ x, negIdMulLog ((μ[|Y ⁻¹' {y}]).map X {x}).toReal := by
+  by_cases hy : μ (Y ⁻¹' {y}) = 0
+  · rw [entropy_def, cond_eq_zero_of_measure_zero hy]
+    simp
+  · have : IsProbabilityMeasure (μ[|Y ⁻¹' {y}]) := cond_isProbabilityMeasure _ hy
+    rw [entropy_eq_sum hX]
+
+end entropy
+
+section condEntropy
+
+variable {X : Ω → S} {Y : Ω → T}
+
+/-- Conditional entropy of a random variable w.r.t. another.
+This is the expectation under the law of `Y` of the entropy of the law of `X` conditioned on the
+event `Y = y`. -/
+noncomputable
+def condEntropy (X : Ω → S) (Y : Ω → T) (μ : Measure Ω := by volume_tac) : ℝ :=
+  (μ.map Y)[fun y ↦ H[X | Y ← y ; μ]]
+
+lemma condEntropy_def (X : Ω → S) (Y : Ω → T) (μ : Measure Ω) :
+    condEntropy X Y μ = (μ.map Y)[fun y ↦ H[X | Y ← y ; μ]] := rfl
+
+notation3:100 "H[" X "|" Y "; " μ "]" => condEntropy X Y μ
+notation3:100 "H[" X "|" Y "]" => condEntropy X Y volume
+
+lemma condEntropy_nonneg (X : Ω → S) (Y : Ω → T) (μ : Measure Ω) : 0 ≤ H[X | Y ; μ] :=
+  integral_nonneg (fun _ ↦ measureEntropy_nonneg _)
+
+lemma condEntropy_le_log_card [MeasurableSingletonClass S]
+    (X : Ω → S) (Y : Ω → T) (hY : Measurable Y) (μ : Measure Ω) [IsProbabilityMeasure μ] :
+    H[X | Y ; μ] ≤ log (Fintype.card S) := by
+  refine (integral_mono_of_nonneg ?_ (integrable_const (log (Fintype.card S))) ?_).trans ?_
+  · exact ae_of_all _ (fun _ ↦ entropy_nonneg _ _)
+  · exact ae_of_all _ (fun _ ↦ entropy_le_log_card _ _)
+  · have : IsProbabilityMeasure (μ.map Y) := isProbabilityMeasure_map hY.aemeasurable
+    simp
+
+lemma condEntropy_eq_sum [MeasurableSingletonClass T] (X : Ω → S) (Y : Ω → T) (μ : Measure Ω)
+    [IsProbabilityMeasure μ] :
+    H[X | Y ; μ] = ∑ y, (μ.map Y {y}).toReal * H[X | Y ← y ; μ] := by
+  rw [condEntropy_def, integral_eq_sum]
+  simp_rw [smul_eq_mul]
+
+lemma condEntropy_eq_sum_sum [MeasurableSingletonClass T] (hX : Measurable X) (Y : Ω → T)
+    (μ : Measure Ω) [IsProbabilityMeasure μ] :
+    H[X | Y ; μ]
+      = ∑ y, ∑ x, (μ.map Y {y}).toReal * negIdMulLog ((μ[|Y ⁻¹' {y}]).map X {x}).toReal := by
+  rw [condEntropy_eq_sum]
+  congr with y
+  rw [entropy_cond_eq_sum hX, Finset.mul_sum]
+
+lemma condEntropy_eq_sum_prod [MeasurableSingletonClass T] (hX : Measurable X) (Y : Ω → T)
+    (μ : Measure Ω) [IsProbabilityMeasure μ] :
+    H[X | Y ; μ] = ∑ p : S × T,
+      (μ.map Y {p.2}).toReal * negIdMulLog ((μ[|Y ⁻¹' {p.2}]).map X {p.1}).toReal := by
+  have h_prod : (Finset.univ : Finset (S × T)) = (Finset.univ : Finset S) ×ˢ Finset.univ := rfl
+  rw [condEntropy_eq_sum_sum hX Y, h_prod, Finset.sum_product_right]
+
+end condEntropy
+
+section pair
+
+variable {X : Ω → S} {Y : Ω → T}
+
+lemma measure_prod_singleton_eq_mul [MeasurableSingletonClass S] [MeasurableSingletonClass T]
+    {μ : Measure Ω} [IsFiniteMeasure μ] (hX : Measurable X) (hY : Measurable Y) {p : S × T} :
+    (μ.map (fun ω ↦ (X ω, Y ω)) {p}).toReal
+      = (μ.map Y {p.2}).toReal * ((μ[|Y ⁻¹' {p.2}]).map X {p.1}).toReal := by
+  have hp_prod : {p} = {p.1} ×ˢ {p.2} := by simp
+  rw [Measure.map_apply (hX.prod_mk hY) (measurableSet_singleton p)]
+  by_cases hpY : μ (Y ⁻¹' {p.2}) = 0
+  · rw [cond_eq_zero_of_measure_zero hpY]
+    simp only [aemeasurable_zero_measure, not_true, Measure.map_zero, Measure.zero_toOuterMeasure,
+      OuterMeasure.coe_zero, Pi.zero_apply, ENNReal.zero_toReal, mul_zero]
+    suffices (μ ((fun a ↦ (X a, Y a)) ⁻¹' ({p.1} ×ˢ {p.2}))).toReal = 0 by convert this
+    rw [Set.mk_preimage_prod, ENNReal.toReal_eq_zero_iff]
+    exact Or.inl (measure_mono_null (Set.inter_subset_right _ _) hpY)
+  rw [Measure.map_apply hY (measurableSet_singleton p.2)]
+  simp_rw [Measure.map_apply hX (measurableSet_singleton _)]
+  simp_rw [cond_apply _ (hY (measurableSet_singleton _))]
+  rw [ENNReal.toReal_mul, ← mul_assoc, ENNReal.toReal_inv, mul_inv_cancel, one_mul, hp_prod,
+    Set.mk_preimage_prod, Set.inter_comm]
+  rw [ENNReal.toReal_ne_zero]; exact ⟨hpY, measure_ne_top _ _⟩
+
+lemma negIdMulLog_measure_prod_singleton [MeasurableSingletonClass S] [MeasurableSingletonClass T]
+    {μ : Measure Ω} [IsFiniteMeasure μ] (hX : Measurable X) (hY : Measurable Y) {p : S × T} :
+    negIdMulLog (μ.map (fun ω ↦ (X ω, Y ω)) {p}).toReal
+      = - ((μ[|Y ⁻¹' {p.2}]).map X {p.1}).toReal
+          * (μ.map Y {p.2}).toReal* log (μ.map Y {p.2}).toReal
+        - (μ.map Y {p.2}).toReal * ((μ[|Y ⁻¹' {p.2}]).map X {p.1}).toReal
+          * log ((μ[|Y ⁻¹' {p.2}]).map X {p.1}).toReal := by
+  rw [measure_prod_singleton_eq_mul hX hY]
+  by_cases hpY : μ (Y ⁻¹' {p.2}) = 0
+  · rw [cond_eq_zero_of_measure_zero hpY]
+    simp
+  by_cases hpX : (μ[|Y ⁻¹' {p.2}]).map X {p.1} = 0
+  · simp [hpX]
+  rw [negIdMulLog, log_mul]
+  · ring
+  · rw [ENNReal.toReal_ne_zero]
+    refine ⟨?_, measure_ne_top _ _⟩
+    rwa [Measure.map_apply hY (measurableSet_singleton _)]
+  · rw [ENNReal.toReal_ne_zero]
+    refine ⟨hpX, measure_ne_top _ _⟩
+
+lemma chain_rule [MeasurableSingletonClass S] [MeasurableSingletonClass T]
+    (μ : Measure Ω) [IsProbabilityMeasure μ] (hX : Measurable X) (hY : Measurable Y) :
+    H[fun ω ↦ (X ω, Y ω) ; μ] = H[Y ; μ] + H[X | Y ; μ] := by
+  have : IsProbabilityMeasure (μ.map Y) := isProbabilityMeasure_map hY.aemeasurable
+  have : IsProbabilityMeasure (μ.map (fun ω ↦ (X ω, Y ω))) :=
+    isProbabilityMeasure_map (hX.prod_mk hY).aemeasurable
+  rw [entropy_def, measureEntropy_of_isProbabilityMeasure]
+  simp_rw [negIdMulLog_measure_prod_singleton hX hY, sub_eq_add_neg, Finset.sum_add_distrib]
   congr
+  · have h_prod : (Finset.univ : Finset (S × T)) = (Finset.univ : Finset S) ×ˢ Finset.univ := rfl
+    rw [h_prod, Finset.sum_product_right, entropy_def, measureEntropy_of_isProbabilityMeasure]
+    congr with y
+    simp only [neg_mul, Finset.sum_neg_distrib]
+    rw [← Finset.sum_mul, ← Finset.sum_mul]
+    by_cases hy : μ (Y ⁻¹' {y}) = 0
+    · simp [cond_eq_zero_of_measure_zero hy, Measure.map_apply hY, hy]
+    have : IsProbabilityMeasure (μ[|Y ⁻¹' {y}]) := cond_isProbabilityMeasure _ hy
+    suffices ∑ x : S, ((μ[|Y ⁻¹' {y}]).map X {x}).toReal = 1 by
+      rw [this, one_mul, ← neg_mul, negIdMulLog]
+    rw [← ENNReal.toReal_sum (fun _ _ ↦ measure_ne_top _ _), ENNReal.toReal_eq_one_iff]
+    simp_rw [Measure.map_apply hX (measurableSet_singleton _)]
+    rw [sum_measure_preimage_singleton _ (fun y _ ↦ hX (measurableSet_singleton y))]
+    simp
+  · rw [condEntropy_eq_sum_prod hX]
+    congr with p
+    rw [negIdMulLog]
+    ring
+
+end pair
+
+section mutualInformation
+
+variable {U : Type*} [Fintype U] [MeasurableSpace U]
+  {X : Ω → S} {Y : Ω → T} {Z : Ω → U} {μ : Measure Ω}
+
+/-- Mutual information (TODO docstring). -/
+noncomputable
+def mutualInformation (X : Ω → S) (Y : Ω → T) (μ : Measure Ω := by volume_tac) : ℝ :=
+  H[X ; μ] + H[Y ; μ] - H[fun ω ↦ (X ω, Y ω) ; μ]
+
+lemma mutualInformation_def (X : Ω → S) (Y : Ω → T) (μ : Measure Ω) :
+  mutualInformation X Y μ = H[X ; μ] + H[Y ; μ] - H[fun ω ↦ (X ω, Y ω) ; μ] := rfl
+
+notation3:100 "I[" X ":" Y ";" μ "]" => mutualInformation X Y μ
+notation3:100 "I[" X ":" Y "]" => mutualInformation X Y volume
+
+lemma mutualInformation_eq_entropy_sub_condEntropy [MeasurableSingletonClass S]
+    [MeasurableSingletonClass T] (hX : Measurable X) (hY : Measurable Y) (μ : Measure Ω)
+    [IsProbabilityMeasure μ] :
+    I[X : Y ; μ] = H[X ; μ] - H[X | Y ; μ] := by
+  rw [mutualInformation_def, chain_rule μ hX hY]
+  abel
+
+noncomputable
+def condMutualInformation (X : Ω → S) (Y : Ω → T) (Z : Ω → U) (μ : Measure Ω := by volume_tac) :
+    ℝ := (μ.map Z)[fun z ↦ H[X | Z ← z ; μ] + H[Y | Z ← z ; μ] - H[fun ω ↦ (X ω, Y ω) | Z ← z ; μ]]
+
+lemma condMutualInformation_def (X : Ω → S) (Y : Ω → T) (Z : Ω → U) (μ : Measure Ω) :
+    condMutualInformation X Y Z μ = (μ.map Z)[fun z ↦
+      H[X | Z ← z ; μ] + H[Y | Z ← z ; μ] - H[fun ω ↦ (X ω, Y ω) | Z ← z ; μ]] := rfl
+
+lemma condMutualInformation_eq_integral_mutualInformation :
+    condMutualInformation X Y Z μ = (μ.map Z)[fun z ↦ I[X : Y ; μ[|Z ⁻¹' {z}]]] := rfl
+
+end mutualInformation
+
+end ProbabilityTheory
+
+
+
+section MeasureSpace_example
+
+open ProbabilityTheory
+
+variable {Ω S T : Type*} [MeasureSpace Ω] [IsProbabilityMeasure (ℙ : Measure Ω)]
+  [Fintype S] [MeasurableSpace S] [MeasurableSingletonClass S]
+  [Fintype T] [MeasurableSpace T] [MeasurableSingletonClass T]
+  {X : Ω → S} {Y : Ω → T}
+
+example (hX : Measurable X) (hY : Measurable Y) :
+  H[fun ω ↦ (X ω, Y ω)] = H[Y] + H[X | Y] := chain_rule _ hX hY
+
+end MeasureSpace_example
diff --git a/PFR/neg_xlogx.lean b/PFR/neg_xlogx.lean
index 2b20de42..ca6a9637 100644
--- a/PFR/neg_xlogx.lean
+++ b/PFR/neg_xlogx.lean
@@ -1,4 +1,5 @@
 import Mathlib.Analysis.SpecialFunctions.Log.Deriv
+import Mathlib.Analysis.SpecialFunctions.Pow.Asymptotics
 import Mathlib.Topology.Semicontinuous
 
 /-!
@@ -7,147 +8,123 @@ import Mathlib.Topology.Semicontinuous
 The purpose of this file is to record basic analytic properties of the function
 $$h(x) = - x * log x$$ on the unit interval, for use in the theory of Shannon entropy.
 
-Thanks to Heather Macbeth for optimizations.
 -/
 
-open Real
+open scoped ENNReal NNReal Topology BigOperators
 
-variable {x : ℝ}
+namespace Real
 
-/- In this file, inversion will always mean inversion of real numbers. -/
-local macro_rules | `($x ⁻¹) => `(Inv.inv ($x : ℝ))
+lemma tendsto_log_mul_nhds_zero_left :
+    Filter.Tendsto (fun x ↦ log x * x) (𝓝[<] 0) (𝓝 0) := by
+  have h := tendsto_log_mul_rpow_nhds_zero zero_lt_one
+  simp only [rpow_one] at h
+  have h_eq : ∀ x ∈ Set.Iio 0, (- (fun x ↦ log x * x) ∘ (fun x ↦ |x|)) x = log x * x := by
+    intro x hx
+    simp only [Set.mem_Iio] at hx
+    simp only [Pi.neg_apply, Function.comp_apply, log_abs]
+    rw [abs_of_nonpos hx.le]
+    simp only [mul_neg, neg_neg]
+  refine tendsto_nhdsWithin_congr h_eq ?_
+  rw [← neg_zero]
+  refine Filter.Tendsto.neg ?_
+  simp only [neg_zero]
+  refine h.comp ?_
+  refine tendsto_abs_nhdsWithin_zero.mono_left ?_
+  refine nhdsWithin_mono 0 (fun x hx ↦ ?_)
+  simp only [Set.mem_Iio] at hx
+  simp only [Set.mem_compl_iff, Set.mem_singleton_iff, hx.ne, not_false_eq_true]
+
+lemma continuous_id_mul_log : Continuous (fun x ↦ x * log x) := by
+  rw [continuous_iff_continuousAt]
+  intro x
+  by_cases hx : x = 0
+  swap; · exact (continuous_id'.continuousAt).mul (continuousAt_log hx)
+  rw [hx]
+  have h := tendsto_log_mul_rpow_nhds_zero zero_lt_one
+  simp only [rpow_one] at h
+  have h' : Filter.Tendsto (fun x ↦ log x * x) (𝓝[<] 0) (𝓝 0) := tendsto_log_mul_nhds_zero_left
+  rw [ContinuousAt, zero_mul]
+  suffices Filter.Tendsto (fun x ↦ log x * x) (𝓝 0) (𝓝 0) by
+    exact this.congr (fun x ↦ by rw [mul_comm])
+  nth_rewrite 1 [← nhdsWithin_univ]
+  have : (Set.univ : Set ℝ) = Set.Iio 0 ∪ Set.Ioi 0 ∪ {0} := by
+    ext x
+    simp only [Set.mem_univ, Set.Iio_union_Ioi, Set.union_singleton, Set.mem_compl_iff,
+      Set.mem_singleton_iff, not_true, Set.mem_insert_iff, true_iff]
+    exact em _
+  rw [this, nhdsWithin_union, nhdsWithin_union]
+  simp only [ge_iff_le, nhdsWithin_singleton, sup_le_iff, Filter.nonpos_iff, Filter.tendsto_sup]
+  refine ⟨⟨h', h⟩, ?_⟩
+  rw [Filter.tendsto_pure_left, mul_zero]
+  intro s hs
+  obtain ⟨t, hts, _, h_zero_mem⟩ := mem_nhds_iff.mp hs
+  exact hts h_zero_mem
+
+lemma differentiableOn_id_mul_log : DifferentiableOn ℝ (fun x ↦ x * log x) {0}ᶜ :=
+  differentiable_id'.differentiableOn.mul differentiableOn_log
+
+lemma deriv_id_mul_log {x : ℝ} (hx : x ≠ 0) : deriv (fun x ↦ x * log x) x = log x + 1 := by
+  rw [deriv_mul differentiableAt_id' (differentiableAt_log hx)]
+  simp only [deriv_id'', one_mul, deriv_log', ne_eq, add_right_inj]
+  exact mul_inv_cancel hx
+
+lemma deriv2_id_mul_log {x : ℝ} (hx : x ≠ 0) : deriv^[2] (fun x ↦ x * log x) x = x⁻¹ := by
+  simp only [Function.iterate_succ, Function.iterate_zero, Function.comp.left_id,
+    Function.comp_apply]
+  suffices ∀ᶠ y in (𝓝 x), deriv (fun x ↦ x * log x) y = log y + 1 by
+    refine (Filter.EventuallyEq.deriv_eq this).trans ?_
+    rw [deriv_add_const, deriv_log x]
+  suffices ∀ᶠ y in (𝓝 x), y ≠ 0 by
+    filter_upwards [this] with y hy
+    exact deriv_id_mul_log hy
+  exact eventually_ne_nhds hx
+
+lemma strictConvexOn_id_mul_log : StrictConvexOn ℝ (Set.Ici (0 : ℝ)) (fun x ↦ x * log x) := by
+  refine strictConvexOn_of_deriv2_pos (convex_Ici 0) (continuous_id_mul_log.continuousOn) ?_
+  intro x hx
+  simp only [Set.nonempty_Iio, interior_Ici', Set.mem_Ioi] at hx
+  rw [deriv2_id_mul_log hx.ne']
+  positivity
+
+lemma convexOn_id_mul_log : ConvexOn ℝ (Set.Ici (0 : ℝ)) (fun x ↦ x * log x) :=
+  strictConvexOn_id_mul_log.convexOn
+
+lemma id_mul_log_nonneg {x : ℝ} (hx : 1 ≤ x) : 0 ≤ x * log x :=
+  mul_nonneg (zero_le_one.trans hx) (log_nonneg hx)
+
+
+section negIdMulLog
+
+/-- The function `x ↦ - x * log x` from `ℝ` to `ℝ`. -/
+noncomputable
+def negIdMulLog (x : ℝ) : ℝ := - x * log x
+
+@[simp]
+lemma negIdMulLog_zero : negIdMulLog (0 : ℝ) = 0 := by simp [negIdMulLog]
+
+@[simp]
+lemma negIdMulLog_one : negIdMulLog (1 : ℝ) = 0 := by simp [negIdMulLog]
 
-/-- The entropy function.  Note that `h 0 = 0` thanks to Lean's notational conventions. May want to
-change the name of `h` and/or localize it to a namespace. -/
-noncomputable def h := (fun x : ℝ ↦ - x * log x)
+lemma negIdMulLog_eq_neg : negIdMulLog = fun x ↦ - (x * log x) := by simp [negIdMulLog]
 
-/-- `h` is nonnegative. -/
-lemma h_nonneg (h1 : 0 ≤ x) (h2 : x ≤ 1) : 0 ≤ h x := by
-  unfold h
-  rw [neg_mul_comm]
+lemma negIdMulLog_nonneg {x : ℝ} (h1 : 0 ≤ x) (h2 : x ≤ 1) : 0 ≤ negIdMulLog x := by
+  rw [negIdMulLog, neg_mul_comm]
   apply mul_nonneg h1
-  simp
+  simp only [Left.nonneg_neg_iff]
   exact log_nonpos h1 h2
 
-@[simp] lemma h_zero : h 0 = 0 := by simp [h]
-@[simp] lemma h_one : h 1 = 0 := by simp [h]
-
-/-- a sublemma needed to get an upper bound for h. -/
-lemma log_le {x:ℝ} (hx: 0 ≤ x) : log x ≤ x / rexp 1 := by
-  rw [le_iff_lt_or_eq] at hx
-  rcases hx with hx | hx
-  . rw [<-sub_le_sub_iff_right 1]
-    convert (log_le_sub_one_of_pos (show 0 < (x * (Real.exp 1)⁻¹) by positivity)) using 1
-    rw [log_mul]
-    . simp; ring
-    all_goals positivity
-  simp [<-hx]
-
-/-- an upper bound for h that can help prove continuity at 0. -/
-lemma h_le {x : ℝ} (hx : 0 ≤ x) : h x ≤ 2 * (sqrt x) / rexp 1 := by
-  unfold h
-  rw [le_iff_lt_or_eq] at hx
-  rcases hx with hx | hx
-  . rw [neg_mul_comm, ←log_inv, ←sq_sqrt (show 0 ≤ x⁻¹ by positivity), log_pow, ←mul_assoc,
-      ←le_div_iff']
-    convert log_le (show 0 ≤ sqrt x⁻¹ by positivity) using 1
-    field_simp
-    nth_rewrite 3 [<- sq_sqrt (show 0 ≤ x by positivity)]
-    ring
-    positivity
-  simp [<-hx]
-
-/-- To prove continuity of h we will need a version of the squeeze test. -/
-lemma squeeze [TopologicalSpace α] [TopologicalSpace β] [LinearOrder β] [OrderTopology β]
-    {f g h : α → β} {x : α} (hfg : f x = g x) (hgh : g x = h x) (lower : ∀ y : α, f y ≤ g y)
-    (upper : ∀ y : α, g y ≤ h y) (f_cont : ContinuousAt f x) (h_cont : ContinuousAt h x) :
-    ContinuousAt g x := by
-  rw [continuousAt_iff_lower_upperSemicontinuousAt] at f_cont h_cont ⊢
-  dsimp [LowerSemicontinuousAt, UpperSemicontinuousAt] at f_cont h_cont ⊢
-  rw [hfg] at f_cont
-  rw [<-hgh] at h_cont
-  constructor
-  . intro a ha
-    apply Filter.Eventually.mono (f_cont.1 a ha)
-    intro x hx
-    exact lt_of_lt_of_le hx (lower x)
-  intro a ha
-  apply Filter.Eventually.mono (h_cont.2 a ha)
-  intro x hx
-  exact LE.le.trans_lt (upper x) hx
-
-/-- actually we need the squeeze test restricted to a subdomain. -/
-lemma squeezeWithin [TopologicalSpace α] [TopologicalSpace β] [LinearOrder β] [OrderTopology β]
-  {f g h : α → β} {s : Set α} {x : α} (hx : x ∈ s) (hfg : f x = g x) (hgh : g x = h x)
-  (lower : ∀ y ∈ s, f y ≤ g y) (upper : ∀ y ∈ s, g y ≤ h y) (f_cont : ContinuousWithinAt f s x)
-  (h_cont : ContinuousWithinAt h s x) : ContinuousWithinAt g s x := by
-  rw [continuousWithinAt_iff_continuousAt_restrict _ hx] at f_cont h_cont ⊢
-  set f' := Set.restrict s f
-  set g' := Set.restrict s g
-  set h' := Set.restrict s h
-  set x' : s := ⟨ x, hx ⟩
-  apply squeeze (show f' x' = g' x' by simpa) (show g' x' = h' x' by simpa) _ _ f_cont h_cont
-  . intro y; simp [lower y]
-  intro y; simp [upper y]
-
-/-- Finally, the continuity of h. -/
-lemma h_cont : ContinuousOn h (Set.Icc 0 1) := by
-  dsimp [ContinuousOn]
-  intro x hx
-  simp at hx; rcases hx with ⟨ hx1, hx2 ⟩
-  rw [le_iff_lt_or_eq] at hx1
-  rcases hx1 with hx1 | hx1
-  . refine (continuous_id.neg.continuousAt.mul (continuousAt_log ?_)).continuousWithinAt
-    linarith
--- the tricky case : continuity at zero!
-  rw [<- hx1]
-  let f := fun _ : ℝ ↦ (0:ℝ)
-  let g := fun x : ℝ ↦ (2 * sqrt x) / rexp 1
-  have f_cont : ContinuousWithinAt f (Set.Icc 0 1) 0 := by
-    apply continuousWithinAt_const
-  have g_cont : ContinuousWithinAt g (Set.Icc 0 1) 0 := by
-    apply Continuous.continuousWithinAt
-    continuity
-  apply squeezeWithin _ _ _ _ _ f_cont g_cont
-  . simp
-  . simp [h]
-  . simp [h]
-  . intro y hy
-    exact h_nonneg hy.1 hy.2
-  intro y hy
-  exact h_le hy.1
-
-/-- The derivative of h. -/
-lemma h_deriv {x : ℝ} (hx: 0 < x) : HasDerivAt h (- log x + (- 1)) x := by
-  convert hasDerivAt_id x |>.neg.mul (hasDerivAt_log ?_) using 1
-  · field_simp
-  · positivity
-
--- how to get differentiability from `HasDerivAt`
-example : DifferentiableOn ℝ h (Set.Ioo 0 1) :=
-  fun _ hx ↦ (h_deriv hx.1).differentiableAt.differentiableWithinAt
-
--- how to get the `deriv` from `HasDerivAt`
-example {x : ℝ} (hx: 0 < x) : deriv h x = - log x + (- 1) := (h_deriv hx).deriv
-
-/-- The concavity of h. -/
-lemma h_concave : ConcaveOn ℝ (Set.Icc 0 1) h := by
-  apply AntitoneOn.concaveOn_of_deriv
-  . apply convex_Icc
-  . exact h_cont
-  . rw [interior_Icc]
-    intro x hx
-    exact (h_deriv hx.1).differentiableAt.differentiableWithinAt
-  rw [interior_Icc]
-  refine ((strictMonoOn_log.monotoneOn.mono ?_).neg.add_const (-1)).congr ?_
-  · intro x hx
-    exact hx.1
-  · intro x hx
-    rw [(h_deriv hx.1).deriv]
-
-open BigOperators
-
-lemma h_jensen [Fintype S] {w : S → ℝ} {p : S → ℝ} (h0 : ∀ s ∈ Finset.univ, 0 ≤ w s)
-    (h1 : ∑ s in Finset.univ, w s = 1) (hmem : ∀ s ∈ Finset.univ, p s ∈ (Set.Icc 0 1)) :
-    ∑ s in Finset.univ, (w s) * h (p s) ≤ h ( ∑ s in Finset.univ, (w s) * (p s)) := by
-  convert ConcaveOn.le_map_sum h_concave h0 h1 hmem
+lemma concaveOn_negIdMulLog : ConcaveOn ℝ (Set.Ici (0 : ℝ)) negIdMulLog := by
+  rw [negIdMulLog_eq_neg]
+  exact convexOn_id_mul_log.neg
+
+lemma sum_negIdMulLog_le {S : Type*} [Fintype S] {w : S → ℝ} {p : S → ℝ} (h0 : ∀ s, 0 ≤ w s)
+    (h1 : ∑ s, w s = 1) (hmem : ∀ s, 0 ≤ p s) :
+    ∑ s, (w s) * negIdMulLog (p s) ≤ negIdMulLog (∑ s, (w s) * (p s)) := by
+  refine ConcaveOn.le_map_sum concaveOn_negIdMulLog ?_ h1 ?_
+  · simp [h0]
+  · simp [hmem]
+
+end negIdMulLog
+
+
+end Real