diff --git a/PFR/entropy_basic.lean b/PFR/entropy_basic.lean index dbaaa58e..03237c5b 100644 --- a/PFR/entropy_basic.lean +++ b/PFR/entropy_basic.lean @@ -1,86 +1,435 @@ -import PFR.probability_space + import PFR.neg_xlogx --- to do: be more specific on which part of Mathlib is being imported - -/-- The purpose of this file is to develop the basic theory of Shannon entropy. -/ - -/- In this file, inversion will always mean inversion of real numbers. -/ -local macro_rules | `($x ⁻¹) => `(Inv.inv ($x : ℝ)) - -open Real -open BigOperators - -variable {Ω S : Type*} [ProbabilitySpace Ω] [Fintype S] {X : Ω → S} - -/-- The entropy of a random variable. -/ -noncomputable def entropy (X : Ω → S) : ℝ := ∑ s, h (P[X ⁻¹' {s}]) - -notation:100 "H[ " X " ]" => entropy X - -/-- Entropy is non-negative -/ -lemma entropy_nonneg (X : Ω → S) : 0 ≤ H[X] := by - unfold entropy - apply Finset.sum_nonneg - intro s _ - apply h_nonneg - . simp - apply ProbabilitySpace.prob_le_one - -/-- Entropy vanishes in the degenerate case -/ -lemma entropy_zero (hΩ : ¬ProbabilitySpace.isNondeg Ω) (X : Ω → S) : H[X] = 0 := by - unfold entropy - conv => - lhs; congr; rfl; ext s - rw [ProbabilitySpace.prob_zero hΩ] - unfold h; simp - -/-- The Jensen bound -/ -lemma entropy_le_log (hX : Measurable X): H[X] ≤ log (Fintype.card S) := by - by_cases hΩ : ProbabilitySpace.isNondeg Ω - . set N := Fintype.card S - have : 0 < N := ProbabilitySpace.range_nonempty' hΩ hX - unfold entropy - have hN : log N = N * h (∑ s : S, N⁻¹ * P[ X ⁻¹' {s} ]) := by - rw [<-Finset.mul_sum] - norm_cast - rw [ProbabilitySpace.totalProb hΩ hX] +import Mathlib.Probability.Notation +import Mathlib.Probability.ConditionalProbability + +/-! +# Entropy and conditional entropy + +## Main definitions + +* `measureEntropy`: entropy of a measure +* `entropy`: entropy of a random variable, defined as `measureEntropy (volume.map X)` +* `condEntropy`: conditional entropy of a random variable `X` w.r.t. another one `Y` +* `mutualInformation`: mutual information of two random variables + +## Main statements + +* `chain_rule`: `entropy (fun ω ↦ (X ω, Y ω)) = entropy Y + condEntropy X Y` + +## Notations + +* `Hm[μ] = measureEntropy μ` +* `H[X] = entropy X` +* `H[X | Y ← y] = Hm[(ℙ[| Y ⁻¹' {y}]).map X]` +* `H[X | Y] = condEntropy X Y`, such that `H[X | Y] = (volume.map Y)[fun y ↦ H[X | Y ← y]]` +* `I[X : Y] = mutualInformation X Y` + +All notations except for `Hm[]` have variants where we can specify the measure (which is otherwise +supposed to be `volume`). For example `H[X ; μ]` and `I[X : Y ; μ]` instead of `H[X]` and +`I[X : Y]` respectively. + +-/ + +open Real MeasureTheory + +open scoped ENNReal NNReal Topology ProbabilityTheory BigOperators + +section aux_lemmas + +-- todo: is this somewhere? +lemma integral_eq_sum {S E : Type*} [Fintype S] [MeasurableSpace S] [MeasurableSingletonClass S] + [NormedAddCommGroup E] [NormedSpace ℝ E] [CompleteSpace E] + (μ : Measure S) [IsFiniteMeasure μ] (f : S → E) : + ∫ x, f x ∂μ = ∑ x, (μ {x}).toReal • (f x) := by + conv_lhs => rw [← Measure.sum_smul_dirac μ] + have hf : Integrable f μ := integrable_of_fintype _ _ + have hf' : Integrable f (Measure.sum fun a ↦ μ {a} • Measure.dirac a) := by + rwa [Measure.sum_smul_dirac μ] + rw [integral_sum_measure hf'] + simp_rw [integral_smul_measure, integral_dirac] + rw [tsum_fintype] + +/-- `μ[|s]` is a finite measure whenever `μ` is finite. -/ +instance cond_isFiniteMeasure {α : Type*} {mα : MeasurableSpace α} {μ : Measure α} + [IsFiniteMeasure μ] (s : Set α) : + IsFiniteMeasure (μ[|s]) := by + constructor + rw [ProbabilityTheory.cond] + simp only [Measure.smul_toOuterMeasure, OuterMeasure.coe_smul, Pi.smul_apply, MeasurableSet.univ, + Measure.restrict_apply, Set.univ_inter, smul_eq_mul] + by_cases hμs : μ s = 0 + · simp [hμs] + · refine ENNReal.mul_lt_top ?_ (measure_ne_top _ _) + simp only [ne_eq, ENNReal.inv_eq_top] + exact hμs + +lemma cond_eq_zero_of_measure_zero {α : Type*} {_ : MeasurableSpace α} {μ : Measure α} {s : Set α} + (hμs : μ s = 0) : + μ[|s] = 0 := by + have : μ.restrict s = 0 := by simp [hμs] + simp [ProbabilityTheory.cond, this] + +lemma sum_measure_singleton {S : Type*} [Fintype S] {_ : MeasurableSpace S} + [MeasurableSingletonClass S] (μ : Measure S) [IsProbabilityMeasure μ] : + ∑ x, μ {x} = 1 := by + change ∑ x, μ (id ⁻¹' {x}) = 1 + rw [sum_measure_preimage_singleton] + · simp + · simp + +end aux_lemmas + + + + +namespace ProbabilityTheory + +variable {Ω S T : Type*} [mΩ : MeasurableSpace Ω] + [Fintype S] [Fintype T] [MeasurableSpace S] [MeasurableSpace T] + +section measureEntropy + +variable {μ : Measure S} + +/-- Entropy of a measure on a finite measurable space. + +We normalize the measure by `(μ Set.univ)⁻¹` to extend the entropy definition to finite measures. +What we really want to do is deal with `μ=0` or `IsProbabilityMeasure μ`, but we don't have +a typeclass for that (we could create one though). +The added complexity in the expression is not an issue because if `μ` is a probability measure, +a call to `simp` will simplify `(μ Set.univ)⁻¹ • μ` to `μ`. -/ +noncomputable +def measureEntropy (μ : Measure S := by volume_tac) : ℝ := + ∑ s, negIdMulLog (((μ Set.univ)⁻¹ • μ) {s}).toReal + +lemma measureEntropy_def (μ : Measure S) : + measureEntropy μ = ∑ s, negIdMulLog (((μ Set.univ)⁻¹ • μ) {s}).toReal := rfl + +notation:100 "Hm[" μ "]" => measureEntropy μ + +@[simp] +lemma measureEntropy_zero : Hm[(0 : Measure S)] = 0 := by + simp [measureEntropy] + +lemma measureEntropy_of_not_isFiniteMeasure (h : ¬ IsFiniteMeasure μ) : + Hm[μ] = 0 := by + simp [measureEntropy, not_isFiniteMeasure_iff.mp h] + +lemma measureEntropy_of_isProbabilityMeasure (μ : Measure S) [IsProbabilityMeasure μ] : + Hm[μ] = ∑ s, negIdMulLog (μ {s}).toReal := by + simp [measureEntropy] + +lemma measureEntropy_univ_smul : Hm[(μ Set.univ)⁻¹ • μ] = Hm[μ] := by + by_cases hμ_fin : IsFiniteMeasure μ + swap + · rw [measureEntropy_of_not_isFiniteMeasure hμ_fin] + rw [not_isFiniteMeasure_iff] at hμ_fin + simp [hμ_fin] + cases eq_zero_or_neZero μ with + | inl hμ => simp [hμ] + | inr hμ => + rw [measureEntropy_def] + simp only [Measure.smul_toOuterMeasure, OuterMeasure.coe_smul, Pi.smul_apply, smul_eq_mul, + ENNReal.toReal_mul] + rw [ENNReal.inv_mul_cancel] + · simp only [inv_one, ENNReal.one_toReal, one_mul] + simp [measureEntropy] + · simp [hμ.out] + · exact measure_ne_top _ _ + +lemma measureEntropy_nonneg (μ : Measure S) : 0 ≤ Hm[μ] := by + by_cases hμ_fin : IsFiniteMeasure μ + swap; · rw [measureEntropy_of_not_isFiniteMeasure hμ_fin] + refine Finset.sum_nonneg (fun s _ ↦ negIdMulLog_nonneg ENNReal.toReal_nonneg ?_) + refine ENNReal.toReal_le_of_le_ofReal zero_le_one ?_ + rw [ENNReal.ofReal_one] + cases eq_zero_or_neZero μ with + | inl hμ => simp [hμ] + | inr hμ => exact prob_le_one + +lemma measureEntropy_le_card_aux [MeasurableSingletonClass S] + (μ : Measure S) [IsProbabilityMeasure μ] : + Hm[μ] ≤ log (Fintype.card S) := by + cases isEmpty_or_nonempty S with + | inl h => + have : μ = 0 := Subsingleton.elim _ _ + simp [Fintype.card_eq_zero, this] + | inr h => + set N := Fintype.card S + have : 0 < N := Fintype.card_pos + rw [measureEntropy_def] + simp only [measure_univ, inv_one, one_smul] + calc ∑ x, negIdMulLog (μ {x}).toReal + = ∑ x, negIdMulLog (μ {x}).toReal := rfl + _ = N * ∑ x, (N : ℝ)⁻¹ * negIdMulLog (μ {x}).toReal := by + rw [Finset.mul_sum] + congr with x + rw [← mul_assoc, mul_inv_cancel, one_mul] + simp [this.ne'] + _ ≤ N * negIdMulLog (∑ x : S, (N : ℝ)⁻¹ * (μ {x}).toReal) := by + refine mul_le_mul le_rfl ?_ ?_ ?_ + · refine sum_negIdMulLog_le ?_ ?_ ?_ + · simp + · simp [Finset.card_univ] + · exact fun _ ↦ ENNReal.toReal_nonneg + · refine Finset.sum_nonneg (fun x _ ↦ ?_) + refine mul_nonneg ?_ ?_ + · simp [this] + · refine negIdMulLog_nonneg (by simp) ?_ + refine ENNReal.toReal_le_of_le_ofReal zero_le_one ?_ + rw [ENNReal.ofReal_one] + exact prob_le_one + · positivity + _ = N * negIdMulLog ((N : ℝ)⁻¹) := by + congr + rw [← Finset.mul_sum] + simp only [ne_eq, inv_eq_zero, Nat.cast_eq_zero, Fintype.card_ne_zero, not_false_eq_true, + mul_eq_left₀] + rw [← ENNReal.toReal_sum (fun _ _ ↦ measure_ne_top _ _), ENNReal.toReal_eq_one_iff] + exact sum_measure_singleton _ + _ = - log ((N : ℝ)⁻¹) := by + simp [negIdMulLog] + _ = log (Fintype.card S) := by simp [Finset.card_univ] + +lemma measureEntropy_le_log_card [MeasurableSingletonClass S] (μ : Measure S) : + Hm[μ] ≤ log (Fintype.card S) := by + have h_log_card_nonneg : 0 ≤ log (Fintype.card S) := by + cases isEmpty_or_nonempty S with + | inl h => + rw [Fintype.card_eq_zero] simp - unfold h - rw [log_inv] - field_simp; ring - rw [hN, <- inv_mul_le_iff, Finset.mul_sum] - set w := fun _ : S ↦ N⁻¹ - set p := fun s : S ↦ (P[ X ⁻¹' {s} ] : ℝ) - - conv => - congr - . congr; rfl - ext s - rw [(show N⁻¹ = w s by simp), (show P[ X ⁻¹' {s} ] = p s by simp)] - congr; congr; rfl - ext s - rw [(show N⁻¹ = w s by simp), (show P[ X ⁻¹' {s} ] = p s by simp)] - apply h_jensen - . intros; simp - . simp; apply mul_inv_cancel; positivity - . intro s _ - simp; norm_cast - exact ProbabilitySpace.prob_le_one (X ⁻¹' {s}) - positivity - rw [entropy_zero hΩ] - positivity - -/-- Equality in Jensen is attained when X is uniform. TODO: also establish converse. One could -also remove hΩ but this seems of little use. -/ -lemma entropy_of_uniform (hΩ : ProbabilitySpace.isNondeg Ω) (hX : ProbabilitySpace.isUniform X) : - H[X] = log (Fintype.card S) := by - rcases hX with ⟨ hX1, hX2 ⟩ - unfold entropy - conv => - lhs; congr; rfl; ext s - rw [hX2 s] - simp [h] - have := ProbabilitySpace.range_nonempty' hΩ hX1 - field_simp - rw [mul_comm] + | inr h => + refine log_nonneg ?_ + simp only [Nat.one_le_cast] + exact Fintype.card_pos + cases eq_zero_or_neZero μ with + | inl hμ => + simp only [hμ, measureEntropy_zero] + exact h_log_card_nonneg + | inr hμ => + by_cases hμ_fin : IsFiniteMeasure μ + swap; + · rw [measureEntropy_of_not_isFiniteMeasure hμ_fin] + exact h_log_card_nonneg + rw [← measureEntropy_univ_smul] + exact measureEntropy_le_card_aux _ + +end measureEntropy + +section entropy + +variable {μ : Measure Ω} {X : Ω → S} {Y : Ω → T} + +/-- Entropy of a random variable with values in a finite measurable space. -/ +noncomputable +def entropy (X : Ω → S) (μ : Measure Ω := by volume_tac) := Hm[μ.map X] + +notation3:100 "H[" X "; " μ "]" => entropy X μ +notation3:100 "H[" X "]" => entropy X volume +notation3:100 "H[" X "|" Y "←" y "; " μ "]" => entropy X (μ[|Y ⁻¹' {y}]) +notation3:100 "H[" X "|" Y "←" y "]" => entropy X (ℙ[|Y ⁻¹' {y}]) + +lemma entropy_def (X : Ω → S) (μ : Measure Ω) : entropy X μ = Hm[μ.map X] := rfl + +lemma entropy_nonneg (X : Ω → S) (μ : Measure Ω) : 0 ≤ entropy X μ := measureEntropy_nonneg _ + +lemma entropy_le_log_card [MeasurableSingletonClass S] + (X : Ω → S) (μ : Measure Ω) : entropy X μ ≤ log (Fintype.card S) := + measureEntropy_le_log_card _ + +lemma entropy_eq_sum (hX : Measurable X) (μ : Measure Ω) [IsProbabilityMeasure μ] : + entropy X μ = ∑ x, negIdMulLog (μ.map X {x}).toReal := by + have : IsProbabilityMeasure (Measure.map X μ) := isProbabilityMeasure_map hX.aemeasurable + rw [entropy_def, measureEntropy_of_isProbabilityMeasure] + +lemma entropy_cond_eq_sum (hX : Measurable X) (μ : Measure Ω) [IsProbabilityMeasure μ] (y : T) : + H[X | Y ← y ; μ] = ∑ x, negIdMulLog ((μ[|Y ⁻¹' {y}]).map X {x}).toReal := by + by_cases hy : μ (Y ⁻¹' {y}) = 0 + · rw [entropy_def, cond_eq_zero_of_measure_zero hy] + simp + · have : IsProbabilityMeasure (μ[|Y ⁻¹' {y}]) := cond_isProbabilityMeasure _ hy + rw [entropy_eq_sum hX] + +end entropy + +section condEntropy + +variable {X : Ω → S} {Y : Ω → T} + +/-- Conditional entropy of a random variable w.r.t. another. +This is the expectation under the law of `Y` of the entropy of the law of `X` conditioned on the +event `Y = y`. -/ +noncomputable +def condEntropy (X : Ω → S) (Y : Ω → T) (μ : Measure Ω := by volume_tac) : ℝ := + (μ.map Y)[fun y ↦ H[X | Y ← y ; μ]] + +lemma condEntropy_def (X : Ω → S) (Y : Ω → T) (μ : Measure Ω) : + condEntropy X Y μ = (μ.map Y)[fun y ↦ H[X | Y ← y ; μ]] := rfl + +notation3:100 "H[" X "|" Y "; " μ "]" => condEntropy X Y μ +notation3:100 "H[" X "|" Y "]" => condEntropy X Y volume + +lemma condEntropy_nonneg (X : Ω → S) (Y : Ω → T) (μ : Measure Ω) : 0 ≤ H[X | Y ; μ] := + integral_nonneg (fun _ ↦ measureEntropy_nonneg _) + +lemma condEntropy_le_log_card [MeasurableSingletonClass S] + (X : Ω → S) (Y : Ω → T) (hY : Measurable Y) (μ : Measure Ω) [IsProbabilityMeasure μ] : + H[X | Y ; μ] ≤ log (Fintype.card S) := by + refine (integral_mono_of_nonneg ?_ (integrable_const (log (Fintype.card S))) ?_).trans ?_ + · exact ae_of_all _ (fun _ ↦ entropy_nonneg _ _) + · exact ae_of_all _ (fun _ ↦ entropy_le_log_card _ _) + · have : IsProbabilityMeasure (μ.map Y) := isProbabilityMeasure_map hY.aemeasurable + simp + +lemma condEntropy_eq_sum [MeasurableSingletonClass T] (X : Ω → S) (Y : Ω → T) (μ : Measure Ω) + [IsProbabilityMeasure μ] : + H[X | Y ; μ] = ∑ y, (μ.map Y {y}).toReal * H[X | Y ← y ; μ] := by + rw [condEntropy_def, integral_eq_sum] + simp_rw [smul_eq_mul] + +lemma condEntropy_eq_sum_sum [MeasurableSingletonClass T] (hX : Measurable X) (Y : Ω → T) + (μ : Measure Ω) [IsProbabilityMeasure μ] : + H[X | Y ; μ] + = ∑ y, ∑ x, (μ.map Y {y}).toReal * negIdMulLog ((μ[|Y ⁻¹' {y}]).map X {x}).toReal := by + rw [condEntropy_eq_sum] + congr with y + rw [entropy_cond_eq_sum hX, Finset.mul_sum] + +lemma condEntropy_eq_sum_prod [MeasurableSingletonClass T] (hX : Measurable X) (Y : Ω → T) + (μ : Measure Ω) [IsProbabilityMeasure μ] : + H[X | Y ; μ] = ∑ p : S × T, + (μ.map Y {p.2}).toReal * negIdMulLog ((μ[|Y ⁻¹' {p.2}]).map X {p.1}).toReal := by + have h_prod : (Finset.univ : Finset (S × T)) = (Finset.univ : Finset S) ×ˢ Finset.univ := rfl + rw [condEntropy_eq_sum_sum hX Y, h_prod, Finset.sum_product_right] + +end condEntropy + +section pair + +variable {X : Ω → S} {Y : Ω → T} + +lemma measure_prod_singleton_eq_mul [MeasurableSingletonClass S] [MeasurableSingletonClass T] + {μ : Measure Ω} [IsFiniteMeasure μ] (hX : Measurable X) (hY : Measurable Y) {p : S × T} : + (μ.map (fun ω ↦ (X ω, Y ω)) {p}).toReal + = (μ.map Y {p.2}).toReal * ((μ[|Y ⁻¹' {p.2}]).map X {p.1}).toReal := by + have hp_prod : {p} = {p.1} ×ˢ {p.2} := by simp + rw [Measure.map_apply (hX.prod_mk hY) (measurableSet_singleton p)] + by_cases hpY : μ (Y ⁻¹' {p.2}) = 0 + · rw [cond_eq_zero_of_measure_zero hpY] + simp only [aemeasurable_zero_measure, not_true, Measure.map_zero, Measure.zero_toOuterMeasure, + OuterMeasure.coe_zero, Pi.zero_apply, ENNReal.zero_toReal, mul_zero] + suffices (μ ((fun a ↦ (X a, Y a)) ⁻¹' ({p.1} ×ˢ {p.2}))).toReal = 0 by convert this + rw [Set.mk_preimage_prod, ENNReal.toReal_eq_zero_iff] + exact Or.inl (measure_mono_null (Set.inter_subset_right _ _) hpY) + rw [Measure.map_apply hY (measurableSet_singleton p.2)] + simp_rw [Measure.map_apply hX (measurableSet_singleton _)] + simp_rw [cond_apply _ (hY (measurableSet_singleton _))] + rw [ENNReal.toReal_mul, ← mul_assoc, ENNReal.toReal_inv, mul_inv_cancel, one_mul, hp_prod, + Set.mk_preimage_prod, Set.inter_comm] + rw [ENNReal.toReal_ne_zero]; exact ⟨hpY, measure_ne_top _ _⟩ + +lemma negIdMulLog_measure_prod_singleton [MeasurableSingletonClass S] [MeasurableSingletonClass T] + {μ : Measure Ω} [IsFiniteMeasure μ] (hX : Measurable X) (hY : Measurable Y) {p : S × T} : + negIdMulLog (μ.map (fun ω ↦ (X ω, Y ω)) {p}).toReal + = - ((μ[|Y ⁻¹' {p.2}]).map X {p.1}).toReal + * (μ.map Y {p.2}).toReal* log (μ.map Y {p.2}).toReal + - (μ.map Y {p.2}).toReal * ((μ[|Y ⁻¹' {p.2}]).map X {p.1}).toReal + * log ((μ[|Y ⁻¹' {p.2}]).map X {p.1}).toReal := by + rw [measure_prod_singleton_eq_mul hX hY] + by_cases hpY : μ (Y ⁻¹' {p.2}) = 0 + · rw [cond_eq_zero_of_measure_zero hpY] + simp + by_cases hpX : (μ[|Y ⁻¹' {p.2}]).map X {p.1} = 0 + · simp [hpX] + rw [negIdMulLog, log_mul] + · ring + · rw [ENNReal.toReal_ne_zero] + refine ⟨?_, measure_ne_top _ _⟩ + rwa [Measure.map_apply hY (measurableSet_singleton _)] + · rw [ENNReal.toReal_ne_zero] + refine ⟨hpX, measure_ne_top _ _⟩ + +lemma chain_rule [MeasurableSingletonClass S] [MeasurableSingletonClass T] + (μ : Measure Ω) [IsProbabilityMeasure μ] (hX : Measurable X) (hY : Measurable Y) : + H[fun ω ↦ (X ω, Y ω) ; μ] = H[Y ; μ] + H[X | Y ; μ] := by + have : IsProbabilityMeasure (μ.map Y) := isProbabilityMeasure_map hY.aemeasurable + have : IsProbabilityMeasure (μ.map (fun ω ↦ (X ω, Y ω))) := + isProbabilityMeasure_map (hX.prod_mk hY).aemeasurable + rw [entropy_def, measureEntropy_of_isProbabilityMeasure] + simp_rw [negIdMulLog_measure_prod_singleton hX hY, sub_eq_add_neg, Finset.sum_add_distrib] congr + · have h_prod : (Finset.univ : Finset (S × T)) = (Finset.univ : Finset S) ×ˢ Finset.univ := rfl + rw [h_prod, Finset.sum_product_right, entropy_def, measureEntropy_of_isProbabilityMeasure] + congr with y + simp only [neg_mul, Finset.sum_neg_distrib] + rw [← Finset.sum_mul, ← Finset.sum_mul] + by_cases hy : μ (Y ⁻¹' {y}) = 0 + · simp [cond_eq_zero_of_measure_zero hy, Measure.map_apply hY, hy] + have : IsProbabilityMeasure (μ[|Y ⁻¹' {y}]) := cond_isProbabilityMeasure _ hy + suffices ∑ x : S, ((μ[|Y ⁻¹' {y}]).map X {x}).toReal = 1 by + rw [this, one_mul, ← neg_mul, negIdMulLog] + rw [← ENNReal.toReal_sum (fun _ _ ↦ measure_ne_top _ _), ENNReal.toReal_eq_one_iff] + simp_rw [Measure.map_apply hX (measurableSet_singleton _)] + rw [sum_measure_preimage_singleton _ (fun y _ ↦ hX (measurableSet_singleton y))] + simp + · rw [condEntropy_eq_sum_prod hX] + congr with p + rw [negIdMulLog] + ring + +end pair + +section mutualInformation + +variable {U : Type*} [Fintype U] [MeasurableSpace U] + {X : Ω → S} {Y : Ω → T} {Z : Ω → U} {μ : Measure Ω} + +/-- Mutual information (TODO docstring). -/ +noncomputable +def mutualInformation (X : Ω → S) (Y : Ω → T) (μ : Measure Ω := by volume_tac) : ℝ := + H[X ; μ] + H[Y ; μ] - H[fun ω ↦ (X ω, Y ω) ; μ] + +lemma mutualInformation_def (X : Ω → S) (Y : Ω → T) (μ : Measure Ω) : + mutualInformation X Y μ = H[X ; μ] + H[Y ; μ] - H[fun ω ↦ (X ω, Y ω) ; μ] := rfl + +notation3:100 "I[" X ":" Y ";" μ "]" => mutualInformation X Y μ +notation3:100 "I[" X ":" Y "]" => mutualInformation X Y volume + +lemma mutualInformation_eq_entropy_sub_condEntropy [MeasurableSingletonClass S] + [MeasurableSingletonClass T] (hX : Measurable X) (hY : Measurable Y) (μ : Measure Ω) + [IsProbabilityMeasure μ] : + I[X : Y ; μ] = H[X ; μ] - H[X | Y ; μ] := by + rw [mutualInformation_def, chain_rule μ hX hY] + abel + +noncomputable +def condMutualInformation (X : Ω → S) (Y : Ω → T) (Z : Ω → U) (μ : Measure Ω := by volume_tac) : + ℝ := (μ.map Z)[fun z ↦ H[X | Z ← z ; μ] + H[Y | Z ← z ; μ] - H[fun ω ↦ (X ω, Y ω) | Z ← z ; μ]] + +lemma condMutualInformation_def (X : Ω → S) (Y : Ω → T) (Z : Ω → U) (μ : Measure Ω) : + condMutualInformation X Y Z μ = (μ.map Z)[fun z ↦ + H[X | Z ← z ; μ] + H[Y | Z ← z ; μ] - H[fun ω ↦ (X ω, Y ω) | Z ← z ; μ]] := rfl + +lemma condMutualInformation_eq_integral_mutualInformation : + condMutualInformation X Y Z μ = (μ.map Z)[fun z ↦ I[X : Y ; μ[|Z ⁻¹' {z}]]] := rfl + +end mutualInformation + +end ProbabilityTheory + + + +section MeasureSpace_example + +open ProbabilityTheory + +variable {Ω S T : Type*} [MeasureSpace Ω] [IsProbabilityMeasure (ℙ : Measure Ω)] + [Fintype S] [MeasurableSpace S] [MeasurableSingletonClass S] + [Fintype T] [MeasurableSpace T] [MeasurableSingletonClass T] + {X : Ω → S} {Y : Ω → T} + +example (hX : Measurable X) (hY : Measurable Y) : + H[fun ω ↦ (X ω, Y ω)] = H[Y] + H[X | Y] := chain_rule _ hX hY + +end MeasureSpace_example diff --git a/PFR/neg_xlogx.lean b/PFR/neg_xlogx.lean index 2b20de42..ca6a9637 100644 --- a/PFR/neg_xlogx.lean +++ b/PFR/neg_xlogx.lean @@ -1,4 +1,5 @@ import Mathlib.Analysis.SpecialFunctions.Log.Deriv +import Mathlib.Analysis.SpecialFunctions.Pow.Asymptotics import Mathlib.Topology.Semicontinuous /-! @@ -7,147 +8,123 @@ import Mathlib.Topology.Semicontinuous The purpose of this file is to record basic analytic properties of the function $$h(x) = - x * log x$$ on the unit interval, for use in the theory of Shannon entropy. -Thanks to Heather Macbeth for optimizations. -/ -open Real +open scoped ENNReal NNReal Topology BigOperators -variable {x : ℝ} +namespace Real -/- In this file, inversion will always mean inversion of real numbers. -/ -local macro_rules | `($x ⁻¹) => `(Inv.inv ($x : ℝ)) +lemma tendsto_log_mul_nhds_zero_left : + Filter.Tendsto (fun x ↦ log x * x) (𝓝[<] 0) (𝓝 0) := by + have h := tendsto_log_mul_rpow_nhds_zero zero_lt_one + simp only [rpow_one] at h + have h_eq : ∀ x ∈ Set.Iio 0, (- (fun x ↦ log x * x) ∘ (fun x ↦ |x|)) x = log x * x := by + intro x hx + simp only [Set.mem_Iio] at hx + simp only [Pi.neg_apply, Function.comp_apply, log_abs] + rw [abs_of_nonpos hx.le] + simp only [mul_neg, neg_neg] + refine tendsto_nhdsWithin_congr h_eq ?_ + rw [← neg_zero] + refine Filter.Tendsto.neg ?_ + simp only [neg_zero] + refine h.comp ?_ + refine tendsto_abs_nhdsWithin_zero.mono_left ?_ + refine nhdsWithin_mono 0 (fun x hx ↦ ?_) + simp only [Set.mem_Iio] at hx + simp only [Set.mem_compl_iff, Set.mem_singleton_iff, hx.ne, not_false_eq_true] + +lemma continuous_id_mul_log : Continuous (fun x ↦ x * log x) := by + rw [continuous_iff_continuousAt] + intro x + by_cases hx : x = 0 + swap; · exact (continuous_id'.continuousAt).mul (continuousAt_log hx) + rw [hx] + have h := tendsto_log_mul_rpow_nhds_zero zero_lt_one + simp only [rpow_one] at h + have h' : Filter.Tendsto (fun x ↦ log x * x) (𝓝[<] 0) (𝓝 0) := tendsto_log_mul_nhds_zero_left + rw [ContinuousAt, zero_mul] + suffices Filter.Tendsto (fun x ↦ log x * x) (𝓝 0) (𝓝 0) by + exact this.congr (fun x ↦ by rw [mul_comm]) + nth_rewrite 1 [← nhdsWithin_univ] + have : (Set.univ : Set ℝ) = Set.Iio 0 ∪ Set.Ioi 0 ∪ {0} := by + ext x + simp only [Set.mem_univ, Set.Iio_union_Ioi, Set.union_singleton, Set.mem_compl_iff, + Set.mem_singleton_iff, not_true, Set.mem_insert_iff, true_iff] + exact em _ + rw [this, nhdsWithin_union, nhdsWithin_union] + simp only [ge_iff_le, nhdsWithin_singleton, sup_le_iff, Filter.nonpos_iff, Filter.tendsto_sup] + refine ⟨⟨h', h⟩, ?_⟩ + rw [Filter.tendsto_pure_left, mul_zero] + intro s hs + obtain ⟨t, hts, _, h_zero_mem⟩ := mem_nhds_iff.mp hs + exact hts h_zero_mem + +lemma differentiableOn_id_mul_log : DifferentiableOn ℝ (fun x ↦ x * log x) {0}ᶜ := + differentiable_id'.differentiableOn.mul differentiableOn_log + +lemma deriv_id_mul_log {x : ℝ} (hx : x ≠ 0) : deriv (fun x ↦ x * log x) x = log x + 1 := by + rw [deriv_mul differentiableAt_id' (differentiableAt_log hx)] + simp only [deriv_id'', one_mul, deriv_log', ne_eq, add_right_inj] + exact mul_inv_cancel hx + +lemma deriv2_id_mul_log {x : ℝ} (hx : x ≠ 0) : deriv^[2] (fun x ↦ x * log x) x = x⁻¹ := by + simp only [Function.iterate_succ, Function.iterate_zero, Function.comp.left_id, + Function.comp_apply] + suffices ∀ᶠ y in (𝓝 x), deriv (fun x ↦ x * log x) y = log y + 1 by + refine (Filter.EventuallyEq.deriv_eq this).trans ?_ + rw [deriv_add_const, deriv_log x] + suffices ∀ᶠ y in (𝓝 x), y ≠ 0 by + filter_upwards [this] with y hy + exact deriv_id_mul_log hy + exact eventually_ne_nhds hx + +lemma strictConvexOn_id_mul_log : StrictConvexOn ℝ (Set.Ici (0 : ℝ)) (fun x ↦ x * log x) := by + refine strictConvexOn_of_deriv2_pos (convex_Ici 0) (continuous_id_mul_log.continuousOn) ?_ + intro x hx + simp only [Set.nonempty_Iio, interior_Ici', Set.mem_Ioi] at hx + rw [deriv2_id_mul_log hx.ne'] + positivity + +lemma convexOn_id_mul_log : ConvexOn ℝ (Set.Ici (0 : ℝ)) (fun x ↦ x * log x) := + strictConvexOn_id_mul_log.convexOn + +lemma id_mul_log_nonneg {x : ℝ} (hx : 1 ≤ x) : 0 ≤ x * log x := + mul_nonneg (zero_le_one.trans hx) (log_nonneg hx) + + +section negIdMulLog + +/-- The function `x ↦ - x * log x` from `ℝ` to `ℝ`. -/ +noncomputable +def negIdMulLog (x : ℝ) : ℝ := - x * log x + +@[simp] +lemma negIdMulLog_zero : negIdMulLog (0 : ℝ) = 0 := by simp [negIdMulLog] + +@[simp] +lemma negIdMulLog_one : negIdMulLog (1 : ℝ) = 0 := by simp [negIdMulLog] -/-- The entropy function. Note that `h 0 = 0` thanks to Lean's notational conventions. May want to -change the name of `h` and/or localize it to a namespace. -/ -noncomputable def h := (fun x : ℝ ↦ - x * log x) +lemma negIdMulLog_eq_neg : negIdMulLog = fun x ↦ - (x * log x) := by simp [negIdMulLog] -/-- `h` is nonnegative. -/ -lemma h_nonneg (h1 : 0 ≤ x) (h2 : x ≤ 1) : 0 ≤ h x := by - unfold h - rw [neg_mul_comm] +lemma negIdMulLog_nonneg {x : ℝ} (h1 : 0 ≤ x) (h2 : x ≤ 1) : 0 ≤ negIdMulLog x := by + rw [negIdMulLog, neg_mul_comm] apply mul_nonneg h1 - simp + simp only [Left.nonneg_neg_iff] exact log_nonpos h1 h2 -@[simp] lemma h_zero : h 0 = 0 := by simp [h] -@[simp] lemma h_one : h 1 = 0 := by simp [h] - -/-- a sublemma needed to get an upper bound for h. -/ -lemma log_le {x:ℝ} (hx: 0 ≤ x) : log x ≤ x / rexp 1 := by - rw [le_iff_lt_or_eq] at hx - rcases hx with hx | hx - . rw [<-sub_le_sub_iff_right 1] - convert (log_le_sub_one_of_pos (show 0 < (x * (Real.exp 1)⁻¹) by positivity)) using 1 - rw [log_mul] - . simp; ring - all_goals positivity - simp [<-hx] - -/-- an upper bound for h that can help prove continuity at 0. -/ -lemma h_le {x : ℝ} (hx : 0 ≤ x) : h x ≤ 2 * (sqrt x) / rexp 1 := by - unfold h - rw [le_iff_lt_or_eq] at hx - rcases hx with hx | hx - . rw [neg_mul_comm, ←log_inv, ←sq_sqrt (show 0 ≤ x⁻¹ by positivity), log_pow, ←mul_assoc, - ←le_div_iff'] - convert log_le (show 0 ≤ sqrt x⁻¹ by positivity) using 1 - field_simp - nth_rewrite 3 [<- sq_sqrt (show 0 ≤ x by positivity)] - ring - positivity - simp [<-hx] - -/-- To prove continuity of h we will need a version of the squeeze test. -/ -lemma squeeze [TopologicalSpace α] [TopologicalSpace β] [LinearOrder β] [OrderTopology β] - {f g h : α → β} {x : α} (hfg : f x = g x) (hgh : g x = h x) (lower : ∀ y : α, f y ≤ g y) - (upper : ∀ y : α, g y ≤ h y) (f_cont : ContinuousAt f x) (h_cont : ContinuousAt h x) : - ContinuousAt g x := by - rw [continuousAt_iff_lower_upperSemicontinuousAt] at f_cont h_cont ⊢ - dsimp [LowerSemicontinuousAt, UpperSemicontinuousAt] at f_cont h_cont ⊢ - rw [hfg] at f_cont - rw [<-hgh] at h_cont - constructor - . intro a ha - apply Filter.Eventually.mono (f_cont.1 a ha) - intro x hx - exact lt_of_lt_of_le hx (lower x) - intro a ha - apply Filter.Eventually.mono (h_cont.2 a ha) - intro x hx - exact LE.le.trans_lt (upper x) hx - -/-- actually we need the squeeze test restricted to a subdomain. -/ -lemma squeezeWithin [TopologicalSpace α] [TopologicalSpace β] [LinearOrder β] [OrderTopology β] - {f g h : α → β} {s : Set α} {x : α} (hx : x ∈ s) (hfg : f x = g x) (hgh : g x = h x) - (lower : ∀ y ∈ s, f y ≤ g y) (upper : ∀ y ∈ s, g y ≤ h y) (f_cont : ContinuousWithinAt f s x) - (h_cont : ContinuousWithinAt h s x) : ContinuousWithinAt g s x := by - rw [continuousWithinAt_iff_continuousAt_restrict _ hx] at f_cont h_cont ⊢ - set f' := Set.restrict s f - set g' := Set.restrict s g - set h' := Set.restrict s h - set x' : s := ⟨ x, hx ⟩ - apply squeeze (show f' x' = g' x' by simpa) (show g' x' = h' x' by simpa) _ _ f_cont h_cont - . intro y; simp [lower y] - intro y; simp [upper y] - -/-- Finally, the continuity of h. -/ -lemma h_cont : ContinuousOn h (Set.Icc 0 1) := by - dsimp [ContinuousOn] - intro x hx - simp at hx; rcases hx with ⟨ hx1, hx2 ⟩ - rw [le_iff_lt_or_eq] at hx1 - rcases hx1 with hx1 | hx1 - . refine (continuous_id.neg.continuousAt.mul (continuousAt_log ?_)).continuousWithinAt - linarith --- the tricky case : continuity at zero! - rw [<- hx1] - let f := fun _ : ℝ ↦ (0:ℝ) - let g := fun x : ℝ ↦ (2 * sqrt x) / rexp 1 - have f_cont : ContinuousWithinAt f (Set.Icc 0 1) 0 := by - apply continuousWithinAt_const - have g_cont : ContinuousWithinAt g (Set.Icc 0 1) 0 := by - apply Continuous.continuousWithinAt - continuity - apply squeezeWithin _ _ _ _ _ f_cont g_cont - . simp - . simp [h] - . simp [h] - . intro y hy - exact h_nonneg hy.1 hy.2 - intro y hy - exact h_le hy.1 - -/-- The derivative of h. -/ -lemma h_deriv {x : ℝ} (hx: 0 < x) : HasDerivAt h (- log x + (- 1)) x := by - convert hasDerivAt_id x |>.neg.mul (hasDerivAt_log ?_) using 1 - · field_simp - · positivity - --- how to get differentiability from `HasDerivAt` -example : DifferentiableOn ℝ h (Set.Ioo 0 1) := - fun _ hx ↦ (h_deriv hx.1).differentiableAt.differentiableWithinAt - --- how to get the `deriv` from `HasDerivAt` -example {x : ℝ} (hx: 0 < x) : deriv h x = - log x + (- 1) := (h_deriv hx).deriv - -/-- The concavity of h. -/ -lemma h_concave : ConcaveOn ℝ (Set.Icc 0 1) h := by - apply AntitoneOn.concaveOn_of_deriv - . apply convex_Icc - . exact h_cont - . rw [interior_Icc] - intro x hx - exact (h_deriv hx.1).differentiableAt.differentiableWithinAt - rw [interior_Icc] - refine ((strictMonoOn_log.monotoneOn.mono ?_).neg.add_const (-1)).congr ?_ - · intro x hx - exact hx.1 - · intro x hx - rw [(h_deriv hx.1).deriv] - -open BigOperators - -lemma h_jensen [Fintype S] {w : S → ℝ} {p : S → ℝ} (h0 : ∀ s ∈ Finset.univ, 0 ≤ w s) - (h1 : ∑ s in Finset.univ, w s = 1) (hmem : ∀ s ∈ Finset.univ, p s ∈ (Set.Icc 0 1)) : - ∑ s in Finset.univ, (w s) * h (p s) ≤ h ( ∑ s in Finset.univ, (w s) * (p s)) := by - convert ConcaveOn.le_map_sum h_concave h0 h1 hmem +lemma concaveOn_negIdMulLog : ConcaveOn ℝ (Set.Ici (0 : ℝ)) negIdMulLog := by + rw [negIdMulLog_eq_neg] + exact convexOn_id_mul_log.neg + +lemma sum_negIdMulLog_le {S : Type*} [Fintype S] {w : S → ℝ} {p : S → ℝ} (h0 : ∀ s, 0 ≤ w s) + (h1 : ∑ s, w s = 1) (hmem : ∀ s, 0 ≤ p s) : + ∑ s, (w s) * negIdMulLog (p s) ≤ negIdMulLog (∑ s, (w s) * (p s)) := by + refine ConcaveOn.le_map_sum concaveOn_negIdMulLog ?_ h1 ?_ + · simp [h0] + · simp [hmem] + +end negIdMulLog + + +end Real