Module Fappli_IEEE_extra


Additional operations and proofs about IEEE-754 binary floating-point numbers, on top of the Flocq library.

Require Import Psatz.
Require Import Bool.
Require Import Eqdep_dec.
Require Import Fcore.
Require Import Fcore_digits.
Require Import Fcalc_digits.
Require Import Fcalc_ops.
Require Import Fcalc_round.
Require Import Fcalc_bracket.
Require Import Fprop_Sterbenz.
Require Import Fappli_IEEE.
Require Import Fappli_rnd_odd.

Local Open Scope Z_scope.

Section Extra_ops.

prec is the number of bits of the mantissa including the implicit one. emax is the exponent of the infinities. Typically p=24 and emax = 128 in single precision.

Variable prec emax : Z.
Context (prec_gt_0_ : Prec_gt_0 prec).
Let emin := (3 - emax - prec)%Z.
Let fexp := FLT_exp emin prec.
Hypothesis Hmax : (prec < emax)%Z.
Let binary_float := binary_float prec emax.

Remarks on is_finite

Remark is_finite_not_is_nan:
  forall (f: binary_float), is_finite _ _ f = true -> is_nan _ _ f = false.
Proof.

Remark is_finite_strict_finite:
  forall (f: binary_float), is_finite_strict _ _ f = true -> is_finite _ _ f = true.
Proof.

Digression on FP numbers that cannot be -0.0.

Definition is_finite_pos0 (f: binary_float) : bool :=
  match f with
  | B754_zero _ _ s => negb s
  | B754_infinity _ _ _ => false
  | B754_nan _ _ _ _ => false
  | B754_finite _ _ _ _ _ _ => true
  end.

Lemma Bsign_pos0:
  forall x, is_finite_pos0 x = true -> Bsign _ _ x = Rlt_bool (B2R _ _ x) 0%R.
Proof.

Theorem B2R_inj_pos0:
  forall x y,
  is_finite_pos0 x = true -> is_finite_pos0 y = true ->
  B2R _ _ x = B2R _ _ y ->
  x = y.
Proof.

Decidable equality


Definition Beq_dec: forall (f1 f2: binary_float), {f1 = f2} + {f1 <> f2}.
Proof.

Conversion from an integer to a FP number


Integers that can be represented exactly as FP numbers.

Definition integer_representable (n: Z): Prop :=
  Z.abs n <= 2^emax - 2^(emax - prec) /\ generic_format radix2 fexp (Z2R n).

Let int_upper_bound_eq: 2^emax - 2^(emax - prec) = (2^prec - 1) * 2^(emax - prec).
Proof.

Lemma integer_representable_n2p:
  forall n p,
  -2^prec < n < 2^prec -> 0 <= p -> p <= emax - prec ->
  integer_representable (n * 2^p).
Proof.

Lemma integer_representable_2p:
  forall p,
  0 <= p <= emax - 1 ->
  integer_representable (2^p).
Proof.

Lemma integer_representable_opp:
  forall n, integer_representable n -> integer_representable (-n).
Proof.

Lemma integer_representable_n2p_wide:
  forall n p,
  -2^prec <= n <= 2^prec -> 0 <= p -> p < emax - prec ->
  integer_representable (n * 2^p).
Proof.

Lemma integer_representable_n:
  forall n, -2^prec <= n <= 2^prec -> integer_representable n.
Proof.

Lemma round_int_no_overflow:
  forall n,
  Z.abs n <= 2^emax - 2^(emax-prec) ->
  (Rabs (round radix2 fexp (round_mode mode_NE) (Z2R n)) < bpow radix2 emax)%R.
Proof.

Conversion from an integer. Round to nearest.

Definition BofZ (n: Z) : binary_float :=
  binary_normalize prec emax prec_gt_0_ Hmax mode_NE n 0 false.

Theorem BofZ_correct:
  forall n,
  if Rlt_bool (Rabs (round radix2 fexp (round_mode mode_NE) (Z2R n))) (bpow radix2 emax)
  then
    B2R prec emax (BofZ n) = round radix2 fexp (round_mode mode_NE) (Z2R n) /\
    is_finite _ _ (BofZ n) = true /\
    Bsign prec emax (BofZ n) = Zlt_bool n 0
  else
    B2FF prec emax (BofZ n) = binary_overflow prec emax mode_NE (Zlt_bool n 0).
Proof.

Theorem BofZ_finite:
  forall n,
  Z.abs n <= 2^emax - 2^(emax-prec) ->
  B2R _ _ (BofZ n) = round radix2 fexp (round_mode mode_NE) (Z2R n)
  /\ is_finite _ _ (BofZ n) = true
  /\ Bsign _ _ (BofZ n) = Zlt_bool n 0%Z.
Proof.

Theorem BofZ_representable:
  forall n,
  integer_representable n ->
  B2R _ _ (BofZ n) = Z2R n
  /\ is_finite _ _ (BofZ n) = true
  /\ Bsign _ _ (BofZ n) = (n <? 0).
Proof.

Theorem BofZ_exact:
  forall n,
  -2^prec <= n <= 2^prec ->
  B2R _ _ (BofZ n) = Z2R n
  /\ is_finite _ _ (BofZ n) = true
  /\ Bsign _ _ (BofZ n) = Zlt_bool n 0%Z.
Proof.

Lemma BofZ_finite_pos0:
  forall n,
  Z.abs n <= 2^emax - 2^(emax-prec) -> is_finite_pos0 (BofZ n) = true.
Proof.

Lemma BofZ_finite_equal:
  forall x y,
  Z.abs x <= 2^emax - 2^(emax-prec) ->
  Z.abs y <= 2^emax - 2^(emax-prec) ->
  B2R _ _ (BofZ x) = B2R _ _ (BofZ y) ->
  BofZ x = BofZ y.
Proof.

Commutation properties with addition, subtraction, multiplication.

Theorem BofZ_plus:
  forall nan p q,
  integer_representable p -> integer_representable q ->
  Bplus _ _ _ Hmax nan mode_NE (BofZ p) (BofZ q) = BofZ (p + q).
Proof.

Theorem BofZ_minus:
  forall nan p q,
  integer_representable p -> integer_representable q ->
  Bminus _ _ _ Hmax nan mode_NE (BofZ p) (BofZ q) = BofZ (p - q).
Proof.

Theorem BofZ_mult:
  forall nan p q,
  integer_representable p -> integer_representable q ->
  0 < q ->
  Bmult _ _ _ Hmax nan mode_NE (BofZ p) (BofZ q) = BofZ (p * q).
Proof.

Theorem BofZ_mult_2p:
  forall nan x p,
  Z.abs x <= 2^emax - 2^(emax-prec) ->
  2^prec <= Z.abs x ->
  0 <= p <= emax - 1 ->
  Bmult _ _ _ Hmax nan mode_NE (BofZ x) (BofZ (2^p)) = BofZ (x * 2^p).
Proof.

Rounding to odd the argument of BofZ.

Lemma round_odd_flt:
  forall prec' emin' x choice,
  prec > 1 -> prec' > 1 -> prec' >= prec + 2 -> emin' <= emin - 2 ->
  round radix2 fexp (Znearest choice) (round radix2 (FLT_exp emin' prec') Zrnd_odd x) =
  round radix2 fexp (Znearest choice) x.
Proof.

Corollary round_odd_fix:
  forall x p choice,
  prec > 1 ->
  0 <= p ->
  (bpow radix2 (prec + p + 1) <= Rabs x)%R ->
  round radix2 fexp (Znearest choice) (round radix2 (FIX_exp p) Zrnd_odd x) =
  round radix2 fexp (Znearest choice) x.
Proof.

Definition int_round_odd (x: Z) (p: Z) :=
  (if Z.eqb (x mod 2^p) 0 || Z.odd (x / 2^p) then x / 2^p else x / 2^p + 1) * 2^p.

Lemma Zrnd_odd_int:
  forall n p, 0 <= p ->
  Zrnd_odd (Z2R n * bpow radix2 (-p)) * 2^p =
  int_round_odd n p.
Proof.

Lemma int_round_odd_le:
  forall p x y, 0 <= p ->
  x <= y -> int_round_odd x p <= int_round_odd y p.
Proof.

Lemma int_round_odd_exact:
  forall p x, 0 <= p ->
  (2^p | x) -> int_round_odd x p = x.
Proof.

Theorem BofZ_round_odd:
  forall x p,
  prec > 1 ->
  Z.abs x <= 2^emax - 2^(emax-prec) ->
  0 <= p <= emax - prec ->
  2^(prec + p + 1) <= Z.abs x ->
  BofZ x = BofZ (int_round_odd x p).
Proof.

Lemma int_round_odd_shifts:
  forall x p, 0 <= p ->
  int_round_odd x p =
  Z.shiftl (if Z.eqb (x mod 2^p) 0 then Z.shiftr x p else Z.lor (Z.shiftr x p) 1) p.
Proof.

Lemma int_round_odd_bits:
  forall x y p, 0 <= p ->
  (forall i, 0 <= i < p -> Z.testbit y i = false) ->
  Z.testbit y p = (if Z.eqb (x mod 2^p) 0 then Z.testbit x p else true) ->
  (forall i, p < i -> Z.testbit y i = Z.testbit x i) ->
  int_round_odd x p = y.
Proof.

Conversion from a FP number to an integer


Always rounds toward zero.

Definition ZofB (f: binary_float): option Z :=
  match f with
    | B754_finite _ _ s m (Zpos e) _ => Some (cond_Zopp s (Zpos m) * Zpower_pos radix2 e)%Z
    | B754_finite _ _ s m 0 _ => Some (cond_Zopp s (Zpos m))
    | B754_finite _ _ s m (Zneg e) _ => Some (cond_Zopp s (Zpos m / Zpower_pos radix2 e))%Z
    | B754_zero _ _ _ => Some 0%Z
    | _ => None
  end.

Theorem ZofB_correct:
  forall f,
  ZofB f = if is_finite _ _ f then Some (Ztrunc (B2R _ _ f)) else None.
Proof.

Interval properties.

Remark Ztrunc_range_pos:
  forall x, 0 < Ztrunc x -> (Z2R (Ztrunc x) <= x < Z2R (Ztrunc x + 1)%Z)%R.
Proof.

Remark Ztrunc_range_zero:
  forall x, Ztrunc x = 0 -> (-1 < x < 1)%R.
Proof.

Theorem ZofB_range_pos:
  forall f n, ZofB f = Some n -> 0 < n -> (Z2R n <= B2R _ _ f < Z2R (n + 1)%Z)%R.
Proof.

Theorem ZofB_range_neg:
  forall f n, ZofB f = Some n -> n < 0 -> (Z2R (n - 1)%Z < B2R _ _ f <= Z2R n)%R.
Proof.

Theorem ZofB_range_zero:
  forall f, ZofB f = Some 0 -> (-1 < B2R _ _ f < 1)%R.
Proof.

Theorem ZofB_range_nonneg:
  forall f n, ZofB f = Some n -> 0 <= n -> (-1 < B2R _ _ f < Z2R (n + 1)%Z)%R.
Proof.

For representable integers, ZofB is left inverse of BofZ.

Theorem ZofBofZ_exact:
  forall n, integer_representable n -> ZofB (BofZ n) = Some n.
Proof.

Compatibility with subtraction

Remark Zfloor_minus:
  forall x n, Zfloor (x - Z2R n) = Zfloor x - n.
Proof.

Theorem ZofB_minus:
  forall minus_nan m f p q,
  ZofB f = Some p -> 0 <= p < 2*q -> q <= 2^prec -> (Z2R q <= B2R _ _ f)%R ->
  ZofB (Bminus _ _ _ Hmax minus_nan m f (BofZ q)) = Some (p - q).
Proof.

A variant of ZofB that bounds the range of representable integers.

Definition ZofB_range (f: binary_float) (zmin zmax: Z): option Z :=
  match ZofB f with
  | None => None
  | Some z => if Zle_bool zmin z && Zle_bool z zmax then Some z else None
  end.

Theorem ZofB_range_correct:
  forall f min max,
  let n := Ztrunc (B2R _ _ f) in
  ZofB_range f min max =
  if is_finite _ _ f && Zle_bool min n && Zle_bool n max then Some n else None.
Proof.

Lemma ZofB_range_inversion:
  forall f min max n,
  ZofB_range f min max = Some n ->
  min <= n /\ n <= max /\ ZofB f = Some n.
Proof.

Theorem ZofB_range_minus:
  forall minus_nan m f p q,
  ZofB_range f 0 (2 * q - 1) = Some p -> q <= 2^prec -> (Z2R q <= B2R _ _ f)%R ->
  ZofB_range (Bminus _ _ _ Hmax minus_nan m f (BofZ q)) (-q) (q - 1) = Some (p - q).
Proof.

Algebraic identities


Commutativity of addition and multiplication

Theorem Bplus_commut:
  forall plus_nan mode (x y: binary_float),
  plus_nan x y = plus_nan y x ->
  Bplus _ _ _ Hmax plus_nan mode x y = Bplus _ _ _ Hmax plus_nan mode y x.
Proof.

Theorem Bmult_commut:
  forall mult_nan mode (x y: binary_float),
  mult_nan x y = mult_nan y x ->
  Bmult _ _ _ Hmax mult_nan mode x y = Bmult _ _ _ Hmax mult_nan mode y x.
Proof.

Multiplication by 2 is diagonal addition.

Theorem Bmult2_Bplus:
  forall plus_nan mult_nan mode (f: binary_float),
  (forall (x y: binary_float),
   is_nan _ _ x = true -> is_finite _ _ y = true -> plus_nan x x = mult_nan x y) ->
  Bplus _ _ _ Hmax plus_nan mode f f = Bmult _ _ _ Hmax mult_nan mode f (BofZ 2%Z).
Proof.

Divisions that can be turned into multiplications by an inverse

Definition Bexact_inverse_mantissa := Z.iter (prec - 1) xO xH.

Remark Bexact_inverse_mantissa_value:
  Zpos Bexact_inverse_mantissa = 2 ^ (prec - 1).
Proof.

Remark Bexact_inverse_mantissa_digits2_pos:
  Z.pos (digits2_pos Bexact_inverse_mantissa) = prec.
Proof.

Remark bounded_Bexact_inverse:
  forall e,
  emin <= e <= emax - prec <-> bounded prec emax Bexact_inverse_mantissa e = true.
Proof.

Program Definition Bexact_inverse (f: binary_float) : option binary_float :=
  match f with
  | B754_finite _ _ s m e B =>
      if positive_eq_dec m Bexact_inverse_mantissa then
      let e' := -e - (prec - 1) * 2 in
      if Z_le_dec emin e' then
      if Z_le_dec e' emax then
        Some(B754_finite _ _ s m e' _)
      else None else None else None
  | _ => None
  end.
Next Obligation.

Lemma Bexact_inverse_correct:
  forall f f', Bexact_inverse f = Some f' ->
  is_finite_strict _ _ f = true
  /\ is_finite_strict _ _ f' = true
  /\ B2R _ _ f' = (/ B2R _ _ f)%R
  /\ B2R _ _ f <> 0%R
  /\ Bsign _ _ f' = Bsign _ _ f.
Proof with

Theorem Bdiv_mult_inverse:
  forall div_nan mult_nan mode x y z,
  (forall (x y z: binary_float),
   is_nan _ _ x = true -> is_finite _ _ y = true -> is_finite _ _ z = true ->
   div_nan x y = mult_nan x z) ->
  Bexact_inverse y = Some z ->
  Bdiv _ _ _ Hmax div_nan mode x y = Bmult _ _ _ Hmax mult_nan mode x z.
Proof.

Conversion from scientific notation


Russian peasant exponentiation

Fixpoint pos_pow (x y: positive) : positive :=
  match y with
  | xH => x
  | xO y => Pos.square (pos_pow x y)
  | xI y => Pos.mul x (Pos.square (pos_pow x y))
  end.

Lemma pos_pow_spec:
  forall x y, Z.pos (pos_pow x y) = Z.pos x ^ Z.pos y.
Proof.

Given a base base, a mantissa m and an exponent e, the following function computes the FP number closest to m * base ^ e, using round to odd, ties break to even. The algorithm is naive, computing base ^ |e| exactly before doing a multiplication or division with m. However, we treat specially very large or very small values of e, when the result is known to be +infinity or 0.0 respectively.

Definition Bparse (base: positive) (m: positive) (e: Z): binary_float :=
  match e with
  | Z0 =>
     BofZ (Zpos m)
  | Zpos p =>
     if e * Z.log2 (Zpos base) <? emax
     then BofZ (Zpos m * Zpos (pos_pow base p))
     else B754_infinity _ _ false
  | Zneg p =>
     if e * Z.log2 (Zpos base) + Z.log2_up (Zpos m) <? emin
     then B754_zero _ _ false
     else FF2B prec emax _ (proj1 (Bdiv_correct_aux prec emax prec_gt_0_ Hmax mode_NE
                                     false m Z0 false (pos_pow base p) Z0))
  end.

Properties of Z.log2 and Z.log2_up.

Lemma Zpower_log:
  forall (base: radix) n,
  0 < n ->
  2 ^ (n * Z.log2 base) <= base ^ n <= 2 ^ (n * Z.log2_up base).
Proof.

Lemma bpow_log_pos:
  forall (base: radix) n,
  0 < n ->
  (bpow radix2 (n * Z.log2 base)%Z <= bpow base n)%R.
Proof.

Lemma bpow_log_neg:
  forall (base: radix) n,
  n < 0 ->
  (bpow base n <= bpow radix2 (n * Z.log2 base)%Z)%R.
Proof.

Overflow and underflow conditions.

Lemma round_integer_overflow:
  forall (base: radix) e m,
  0 < e ->
  emax <= e * Z.log2 base ->
  (bpow radix2 emax <= round radix2 fexp (round_mode mode_NE) (Z2R (Zpos m) * bpow base e))%R.
Proof.

Lemma round_NE_underflows:
  forall x,
  (0 <= x <= bpow radix2 (emin - 1))%R ->
  round radix2 fexp (round_mode mode_NE) x = 0%R.
Proof.

Lemma round_integer_underflow:
  forall (base: radix) e m,
  e < 0 ->
  e * Z.log2 base + Z.log2_up (Zpos m) < emin ->
  round radix2 fexp (round_mode mode_NE) (Z2R (Zpos m) * bpow base e) = 0%R.
Proof.

Correctness of Bparse

Theorem Bparse_correct:
  forall b m e (BASE: 2 <= Zpos b),
  let base := {| radix_val := Zpos b; radix_prop := Zle_imp_le_bool _ _ BASE |} in
  let r := round radix2 fexp (round_mode mode_NE) (Z2R (Zpos m) * bpow base e) in
  if Rlt_bool (Rabs r) (bpow radix2 emax) then
     B2R _ _ (Bparse b m e) = r
  /\ is_finite _ _ (Bparse b m e) = true
  /\ Bsign _ _ (Bparse b m e) = false
  else
    B2FF _ _ (Bparse b m e) = F754_infinity false.
Proof.

End Extra_ops.

Conversions between two FP formats


Section Conversions.

Variable prec1 emax1 prec2 emax2 : Z.
Context (prec1_gt_0_ : Prec_gt_0 prec1) (prec2_gt_0_ : Prec_gt_0 prec2).
Let emin1 := (3 - emax1 - prec1)%Z.
Let fexp1 := FLT_exp emin1 prec1.
Let emin2 := (3 - emax2 - prec2)%Z.
Let fexp2 := FLT_exp emin2 prec2.
Hypothesis Hmax1 : (prec1 < emax1)%Z.
Hypothesis Hmax2 : (prec2 < emax2)%Z.
Let binary_float1 := binary_float prec1 emax1.
Let binary_float2 := binary_float prec2 emax2.

Definition Bconv (conv_nan: bool -> nan_pl prec1 -> bool * nan_pl prec2) (md: mode) (f: binary_float1) : binary_float2 :=
  match f with
    | B754_nan _ _ s pl => let '(s, pl) := conv_nan s pl in B754_nan _ _ s pl
    | B754_infinity _ _ s => B754_infinity _ _ s
    | B754_zero _ _ s => B754_zero _ _ s
    | B754_finite _ _ s m e _ => binary_normalize _ _ _ Hmax2 md (cond_Zopp s (Zpos m)) e s
  end.

Theorem Bconv_correct:
  forall conv_nan m f,
  is_finite _ _ f = true ->
  if Rlt_bool (Rabs (round radix2 fexp2 (round_mode m) (B2R _ _ f))) (bpow radix2 emax2)
  then
     B2R _ _ (Bconv conv_nan m f) = round radix2 fexp2 (round_mode m) (B2R _ _ f)
  /\ is_finite _ _ (Bconv conv_nan m f) = true
  /\ Bsign _ _ (Bconv conv_nan m f) = Bsign _ _ f
  else
     B2FF _ _ (Bconv conv_nan m f) = binary_overflow prec2 emax2 m (Bsign _ _ f).
Proof.

Converting a finite FP number to higher or equal precision preserves its value.

Theorem Bconv_widen_exact:
  (prec2 >= prec1)%Z -> (emax2 >= emax1)%Z ->
  forall conv_nan m f,
  is_finite _ _ f = true ->
     B2R _ _ (Bconv conv_nan m f) = B2R _ _ f
  /\ is_finite _ _ (Bconv conv_nan m f) = true
  /\ Bsign _ _ (Bconv conv_nan m f) = Bsign _ _ f.
Proof.

Conversion from integers and change of format

Theorem Bconv_BofZ:
  forall conv_nan n,
  integer_representable prec1 emax1 n ->
  Bconv conv_nan mode_NE (BofZ prec1 emax1 _ Hmax1 n) = BofZ prec2 emax2 _ Hmax2 n.
Proof.

Change of format (to higher precision) and conversion to integer.

Theorem ZofB_Bconv:
  prec2 >= prec1 -> emax2 >= emax1 ->
  forall conv_nan m f n,
  ZofB _ _ f = Some n -> ZofB _ _ (Bconv conv_nan m f) = Some n.
Proof.

Theorem ZofB_range_Bconv:
  forall min1 max1 min2 max2,
  prec2 >= prec1 -> emax2 >= emax1 -> min2 <= min1 -> max1 <= max2 ->
  forall conv_nan m f n,
  ZofB_range _ _ f min1 max1 = Some n ->
  ZofB_range _ _ (Bconv conv_nan m f) min2 max2 = Some n.
Proof.

Change of format (to higher precision) and comparison.

Theorem Bcompare_Bconv_widen:
  prec2 >= prec1 -> emax2 >= emax1 ->
  forall conv_nan m x y,
  Bcompare _ _ (Bconv conv_nan m x) (Bconv conv_nan m y) = Bcompare _ _ x y.
Proof.

End Conversions.

Section Compose_Conversions.

Variable prec1 emax1 prec2 emax2 : Z.
Context (prec1_gt_0_ : Prec_gt_0 prec1) (prec2_gt_0_ : Prec_gt_0 prec2).
Let emin1 := (3 - emax1 - prec1)%Z.
Let fexp1 := FLT_exp emin1 prec1.
Let emin2 := (3 - emax2 - prec2)%Z.
Let fexp2 := FLT_exp emin2 prec2.
Hypothesis Hmax1 : (prec1 < emax1)%Z.
Hypothesis Hmax2 : (prec2 < emax2)%Z.
Let binary_float1 := binary_float prec1 emax1.
Let binary_float2 := binary_float prec2 emax2.

Converting to a higher precision then down to the original format is the identity.
Theorem Bconv_narrow_widen:
  prec2 >= prec1 -> emax2 >= emax1 ->
  forall narrow_nan widen_nan m f,
  is_nan _ _ f = false ->
  Bconv prec2 emax2 prec1 emax1 _ Hmax1 narrow_nan m (Bconv prec1 emax1 prec2 emax2 _ Hmax2 widen_nan m f) = f.
Proof.

End Compose_Conversions.