fp_trunc_impl.inc - kit

fp_trunc_impl.inc (14405B)
      1 //= lib/fp_trunc_impl.inc - high precision -> low precision conversion *-*-===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 // This file implements a fairly generic conversion from a wider to a narrower
     10 // IEEE-754 floating-point type in the default (round to nearest, ties to even)
     11 // rounding mode.  The constants and types defined following the includes below
     12 // parameterize the conversion.
     13 //
// The type-parameter blocks below cover single, double and quad precision
// sources, and single, double, half, bfloat16 and Intel 80-bit destinations.
// The *source* must use one of the usual IEEE-754 interchange formats;
// specifically, some work would be needed to accept (for example) the Intel
// 80-bit format as a source, or to handle the PowerPC double-double format.
     19 //
// Please note, however, that this implementation is only intended to support
// *narrowing* operations; if you need to convert to a *wider* floating-point
// type (e.g. float -> double), then this routine will not do what you want it
// to.
     24 //
     25 // It also requires that integer types at least as large as both formats
     26 // are available on the target platform; this may pose a problem when trying
     27 // to add support for quad on some 32-bit systems, for example.
     28 //
     29 // Finally, the following assumptions are made:
     30 //
     31 // 1. Floating-point types and integer types have the same endianness on the
     32 //    target platform.
     33 //
     34 // 2. Quiet NaNs, if supported, are indicated by the leading bit of the
     35 //    significand field being set.
     36 //
     37 //===----------------------------------------------------------------------===//
     38 
     39 // ---- fp_trunc.h (was a separate header; merged) ----
     40 #include "int_lib.h"
     41 
     42 // Self-clean any prior inclusion's per-(src, dst) state. May coexist with
     43 // fp_extend.h / int_to_fp.h which define overlapping bare-name aliases.
     44 #undef _FP_TRUNC_SRC_SUF
     45 #undef _FP_TRUNC_DST_SUF
     46 #undef SRC_REP_C
     47 #undef DST_REP_C
     48 #undef src_t
     49 #undef src_rep_t
     50 #undef dst_t
     51 #undef dst_rep_t
     52 #undef srcBits
     53 #undef srcSigFracBits
     54 #undef srcExpBits
     55 #undef dstBits
     56 #undef dstSigFracBits
     57 #undef dstExpBits
     58 #undef srcToRep
     59 #undef dstFromRep
     60 #undef extract_sign_from_src
     61 #undef extract_exp_from_src
     62 #undef extract_sig_frac_from_src
     63 #undef construct_dst_rep
     64 
     65 #if defined SRC_SINGLE
     66 #define _FP_TRUNC_SRC_SUF sf
     67 #elif defined SRC_DOUBLE
     68 #define _FP_TRUNC_SRC_SUF df
     69 #elif defined SRC_QUAD
     70 #define _FP_TRUNC_SRC_SUF tf
     71 #else
     72 #error Source should be double precision or quad precision!
     73 #endif
     74 
     75 #if defined DST_SINGLE
     76 #define _FP_TRUNC_DST_SUF sf
     77 #elif defined DST_DOUBLE
     78 #define _FP_TRUNC_DST_SUF df
     79 #elif defined DST_80
     80 #define _FP_TRUNC_DST_SUF xf
     81 #elif defined DST_HALF
     82 #define _FP_TRUNC_DST_SUF hf
     83 #elif defined DST_BFLOAT
     84 #define _FP_TRUNC_DST_SUF bf
     85 #else
     86 #error Destination should be single precision or double precision!
     87 #endif
     88 
     89 #define _FP_TRUNC_PASTE4_(a, b, c, d) a##b##c##d
     90 #define _FP_TRUNC_PASTE4(a, b, c, d)  _FP_TRUNC_PASTE4_(a, b, c, d)
     91 #define _FP_TRUNC_PAIR(stem)          _FP_TRUNC_PASTE4(stem, _, _FP_TRUNC_SRC_SUF, _FP_TRUNC_DST_SUF)
     92 
     93 // ---- Bare-name aliases (re-set every inclusion). ------------------------
     94 // fp_extend.h uses identical bare names; the aliases here suffix-rename
     95 // to fp_trunc-specific symbols (different pair tokens), so the two
     96 // headers can coexist in one TU without colliding.
     97 
     98 #define src_t                       _FP_TRUNC_PAIR(src_t)
     99 #define src_rep_t                   _FP_TRUNC_PAIR(src_rep_t)
    100 #define dst_t                       _FP_TRUNC_PAIR(dst_t)
    101 #define dst_rep_t                   _FP_TRUNC_PAIR(dst_rep_t)
    102 #define srcBits                     _FP_TRUNC_PAIR(srcBits)
    103 #define srcSigFracBits              _FP_TRUNC_PAIR(srcSigFracBits)
    104 #define srcExpBits                  _FP_TRUNC_PAIR(srcExpBits)
    105 #define dstBits                     _FP_TRUNC_PAIR(dstBits)
    106 #define dstSigFracBits              _FP_TRUNC_PAIR(dstSigFracBits)
    107 #define dstExpBits                  _FP_TRUNC_PAIR(dstExpBits)
    108 #define srcToRep                    _FP_TRUNC_PAIR(srcToRep)
    109 #define dstFromRep                  _FP_TRUNC_PAIR(dstFromRep)
    110 #define extract_sign_from_src       _FP_TRUNC_PAIR(extract_sign_from_src)
    111 #define extract_exp_from_src        _FP_TRUNC_PAIR(extract_exp_from_src)
    112 #define extract_sig_frac_from_src   _FP_TRUNC_PAIR(extract_sig_frac_from_src)
    113 #define construct_dst_rep           _FP_TRUNC_PAIR(construct_dst_rep)
    114 
    115 #if defined SRC_SINGLE
    116 #define SRC_REP_C UINT32_C
    117 #elif defined SRC_DOUBLE
    118 #define SRC_REP_C UINT64_C
    119 #elif defined SRC_QUAD
    120 #define SRC_REP_C (__uint128_t)
    121 #endif
    122 
    123 #if defined DST_SINGLE
    124 #define DST_REP_C UINT32_C
    125 #elif defined DST_DOUBLE
    126 #define DST_REP_C UINT64_C
    127 #elif defined DST_80
    128 #define DST_REP_C (__uint128_t)
    129 #elif defined DST_HALF
    130 #define DST_REP_C UINT16_C
    131 #elif defined DST_BFLOAT
    132 #define DST_REP_C UINT16_C
    133 #endif
    134 
    135 // ---- One-time emission per (TU, src+dst pair). --------------------------
    136 // Pairs kit uses: df→sf, tf→df, tf→sf.
    137 
    138 #if defined SRC_DOUBLE && defined DST_SINGLE && !defined FP_TRUNC_DFSF_EMITTED
    139 #define FP_TRUNC_DFSF_EMITTED
    140 #define _FP_TRUNC_EMIT 1
    141 #elif defined SRC_QUAD && defined DST_DOUBLE && !defined FP_TRUNC_TFDF_EMITTED
    142 #define FP_TRUNC_TFDF_EMITTED
    143 #define _FP_TRUNC_EMIT 1
    144 #elif defined SRC_QUAD && defined DST_SINGLE && !defined FP_TRUNC_TFSF_EMITTED
    145 #define FP_TRUNC_TFSF_EMITTED
    146 #define _FP_TRUNC_EMIT 1
    147 #endif
    148 
    149 #ifdef _FP_TRUNC_EMIT
    150 #undef _FP_TRUNC_EMIT
    151 
    152 #if defined SRC_SINGLE
    153 typedef float src_t;
    154 typedef uint32_t src_rep_t;
    155 static const int srcBits = sizeof(src_t) * CHAR_BIT;
    156 static const int srcSigFracBits = 23;
    157 // -1 accounts for the sign bit.
    158 // srcBits - srcSigFracBits - 1
    159 static const int srcExpBits = 8;
    160 
    161 #elif defined SRC_DOUBLE
    162 typedef double src_t;
    163 typedef uint64_t src_rep_t;
    164 static const int srcBits = sizeof(src_t) * CHAR_BIT;
    165 static const int srcSigFracBits = 52;
    166 // -1 accounts for the sign bit.
    167 // srcBits - srcSigFracBits - 1
    168 static const int srcExpBits = 11;
    169 
    170 #elif defined SRC_QUAD
    171 typedef tf_float src_t;
    172 typedef __uint128_t src_rep_t;
    173 static const int srcBits = sizeof(src_t) * CHAR_BIT;
    174 static const int srcSigFracBits = 112;
    175 // -1 accounts for the sign bit.
    176 // srcBits - srcSigFracBits - 1
    177 static const int srcExpBits = 15;
    178 
    179 #endif // end source precision
    180 
    181 #if defined DST_DOUBLE
    182 typedef double dst_t;
    183 typedef uint64_t dst_rep_t;
    184 static const int dstBits = sizeof(dst_t) * CHAR_BIT;
    185 static const int dstSigFracBits = 52;
    186 // -1 accounts for the sign bit.
    187 // dstBits - dstSigFracBits - 1
    188 static const int dstExpBits = 11;
    189 
    190 #elif defined DST_80
    191 typedef xf_float dst_t;
    192 typedef __uint128_t dst_rep_t;
    193 static const int dstBits = 80;
    194 static const int dstSigFracBits = 63;
    195 // -1 accounts for the sign bit.
    196 // -1 accounts for the explicitly stored integer bit.
    197 // dstBits - dstSigFracBits - 1 - 1
    198 static const int dstExpBits = 15;
    199 
    200 #elif defined DST_SINGLE
    201 typedef float dst_t;
    202 typedef uint32_t dst_rep_t;
    203 static const int dstBits = sizeof(dst_t) * CHAR_BIT;
    204 static const int dstSigFracBits = 23;
    205 // -1 accounts for the sign bit.
    206 // dstBits - dstSigFracBits - 1
    207 static const int dstExpBits = 8;
    208 
    209 #elif defined DST_HALF
    210 #ifdef COMPILER_RT_HAS_FLOAT16
    211 typedef _Float16 dst_t;
    212 #else
    213 typedef uint16_t dst_t;
    214 #endif
    215 typedef uint16_t dst_rep_t;
    216 static const int dstBits = sizeof(dst_t) * CHAR_BIT;
    217 static const int dstSigFracBits = 10;
    218 // -1 accounts for the sign bit.
    219 // dstBits - dstSigFracBits - 1
    220 static const int dstExpBits = 5;
    221 
    222 #elif defined DST_BFLOAT
    223 typedef __bf16 dst_t;
    224 typedef uint16_t dst_rep_t;
    225 static const int dstBits = sizeof(dst_t) * CHAR_BIT;
    226 static const int dstSigFracBits = 7;
    227 // -1 accounts for the sign bit.
    228 // dstBits - dstSigFracBits - 1
    229 static const int dstExpBits = 8;
    230 
    231 #endif // end destination precision
    232 
    233 // TODO: These helper routines should be placed into fp_lib.h
    234 // Currently they depend on macros/constants defined above.
    235 
    236 static inline src_rep_t extract_sign_from_src(src_rep_t x) {
    237   const src_rep_t srcSignMask = SRC_REP_C(1) << (srcBits - 1);
    238   return (x & srcSignMask) >> (srcBits - 1);
    239 }
    240 
    241 static inline src_rep_t extract_exp_from_src(src_rep_t x) {
    242   const int srcSigBits = srcBits - 1 - srcExpBits;
    243   const src_rep_t srcExpMask = ((SRC_REP_C(1) << srcExpBits) - 1) << srcSigBits;
    244   return (x & srcExpMask) >> srcSigBits;
    245 }
    246 
    247 static inline src_rep_t extract_sig_frac_from_src(src_rep_t x) {
    248   const src_rep_t srcSigFracMask = (SRC_REP_C(1) << srcSigFracBits) - 1;
    249   return x & srcSigFracMask;
    250 }
    251 
    252 static inline dst_rep_t construct_dst_rep(dst_rep_t sign, dst_rep_t exp, dst_rep_t sigFrac) {
    253   dst_rep_t result = (sign << (dstBits - 1)) | (exp << (dstBits - 1 - dstExpBits)) | sigFrac;
    254   // Set the explicit integer bit in F80 if present.
    255   if (dstBits == 80 && exp) {
    256     result |= (DST_REP_C(1) << dstSigFracBits);
    257   }
    258   return result;
    259 }
    260 
    261 // End of specialization parameters.  Two helper routines for conversion to and
    262 // from the representation of floating-point data as integer values follow.
    263 
    264 static inline src_rep_t srcToRep(src_t x) {
    265   const union {
    266     src_t f;
    267     src_rep_t i;
    268   } rep = {.f = x};
    269   return rep.i;
    270 }
    271 
    272 static inline dst_t dstFromRep(dst_rep_t x) {
    273   const union {
    274     dst_t f;
    275     dst_rep_t i;
    276   } rep = {.i = x};
    277   return rep.f;
    278 }
    279 
    280 #endif // _FP_TRUNC_EMIT
    281 
    282 #define __truncXfYf2__ _FP_TRUNC_PAIR(__truncXfYf2__)
    283 
    284 #if defined SRC_DOUBLE && defined DST_SINGLE && !defined FP_TRUNC_IMPL_DFSF_EMITTED
    285 #define FP_TRUNC_IMPL_DFSF_EMITTED
    286 #define _FP_TRUNC_IMPL_EMIT 1
    287 #elif defined SRC_QUAD && defined DST_DOUBLE && !defined FP_TRUNC_IMPL_TFDF_EMITTED
    288 #define FP_TRUNC_IMPL_TFDF_EMITTED
    289 #define _FP_TRUNC_IMPL_EMIT 1
    290 #elif defined SRC_QUAD && defined DST_SINGLE && !defined FP_TRUNC_IMPL_TFSF_EMITTED
    291 #define FP_TRUNC_IMPL_TFSF_EMITTED
    292 #define _FP_TRUNC_IMPL_EMIT 1
    293 #endif
    294 
    295 #ifdef _FP_TRUNC_IMPL_EMIT
    296 #undef _FP_TRUNC_IMPL_EMIT
    297 
    298 // The destination type may use a usual IEEE-754 interchange format or Intel
    299 // 80-bit format. In particular, for the destination type dstSigFracBits may be
    300 // not equal to dstSigBits. The source type is assumed to be one of IEEE-754
    301 // standard types.
    302 static inline dst_t __truncXfYf2__(src_t a) {
    303   // Various constants whose values follow from the type parameters.
    304   // Any reasonable optimizer will fold and propagate all of these.
    305   const int srcInfExp = (1 << srcExpBits) - 1;
    306   const int srcExpBias = srcInfExp >> 1;
    307 
    308   const src_rep_t srcMinNormal = SRC_REP_C(1) << srcSigFracBits;
    309   const src_rep_t roundMask =
    310       (SRC_REP_C(1) << (srcSigFracBits - dstSigFracBits)) - 1;
    311   const src_rep_t halfway = SRC_REP_C(1)
    312                             << (srcSigFracBits - dstSigFracBits - 1);
    313   const src_rep_t srcQNaN = SRC_REP_C(1) << (srcSigFracBits - 1);
    314   const src_rep_t srcNaNCode = srcQNaN - 1;
    315 
    316   const int dstInfExp = (1 << dstExpBits) - 1;
    317   const int dstExpBias = dstInfExp >> 1;
    318   const int overflowExponent = srcExpBias + dstInfExp - dstExpBias;
    319 
    320   const dst_rep_t dstQNaN = DST_REP_C(1) << (dstSigFracBits - 1);
    321   const dst_rep_t dstNaNCode = dstQNaN - 1;
    322 
    323   const src_rep_t aRep = srcToRep(a);
    324   const src_rep_t srcSign = extract_sign_from_src(aRep);
    325   const src_rep_t srcExp = extract_exp_from_src(aRep);
    326   const src_rep_t srcSigFrac = extract_sig_frac_from_src(aRep);
    327 
    328   dst_rep_t dstSign = srcSign;
    329   dst_rep_t dstExp;
    330   dst_rep_t dstSigFrac;
    331 
    332   // Same size exponents and a's significand tail is 0.
    333   // The significand can be truncated and the exponent can be copied over.
    334   const int sigFracTailBits = srcSigFracBits - dstSigFracBits;
    335   if (srcExpBits == dstExpBits &&
    336       ((aRep >> sigFracTailBits) << sigFracTailBits) == aRep) {
    337     dstExp = srcExp;
    338     dstSigFrac = (dst_rep_t)(srcSigFrac >> sigFracTailBits);
    339     return dstFromRep(construct_dst_rep(dstSign, dstExp, dstSigFrac));
    340   }
    341 
    342   const int dstExpCandidate = ((int)srcExp - srcExpBias) + dstExpBias;
    343   if (dstExpCandidate >= 1 && dstExpCandidate < dstInfExp) {
    344     // The exponent of a is within the range of normal numbers in the
    345     // destination format. We can convert by simply right-shifting with
    346     // rounding and adjusting the exponent.
    347     dstExp = dstExpCandidate;
    348     dstSigFrac = (dst_rep_t)(srcSigFrac >> sigFracTailBits);
    349 
    350     const src_rep_t roundBits = srcSigFrac & roundMask;
    351     // Round to nearest.
    352     if (roundBits > halfway)
    353       dstSigFrac++;
    354     // Tie to even.
    355     else if (roundBits == halfway)
    356       dstSigFrac += dstSigFrac & 1;
    357 
    358     // Rounding has changed the exponent.
    359     if (dstSigFrac >= (DST_REP_C(1) << dstSigFracBits)) {
    360       dstExp += 1;
    361       dstSigFrac ^= (DST_REP_C(1) << dstSigFracBits);
    362     }
    363   } else if (srcExp == srcInfExp && srcSigFrac) {
    364     // a is NaN.
    365     // Conjure the result by beginning with infinity, setting the qNaN
    366     // bit and inserting the (truncated) trailing NaN field.
    367     dstExp = dstInfExp;
    368     dstSigFrac = dstQNaN;
    369     dstSigFrac |= ((srcSigFrac & srcNaNCode) >> sigFracTailBits) & dstNaNCode;
    370   } else if ((int)srcExp >= overflowExponent) {
    371     dstExp = dstInfExp;
    372     dstSigFrac = 0;
    373   } else {
    374     // a underflows on conversion to the destination type or is an exact
    375     // zero.  The result may be a denormal or zero.  Extract the exponent
    376     // to get the shift amount for the denormalization.
    377     src_rep_t significand = srcSigFrac;
    378     int shift = srcExpBias - dstExpBias - srcExp;
    379 
    380     if (srcExp) {
    381       // Set the implicit integer bit if the source is a normal number.
    382       significand |= srcMinNormal;
    383       shift += 1;
    384     }
    385 
    386     // Right shift by the denormalization amount with sticky.
    387     if (shift > srcSigFracBits) {
    388       dstExp = 0;
    389       dstSigFrac = 0;
    390     } else {
    391       dstExp = 0;
    392       const bool sticky = shift && ((significand << (srcBits - shift)) != 0);
    393       src_rep_t denormalizedSignificand = significand >> shift | sticky;
    394       dstSigFrac = denormalizedSignificand >> sigFracTailBits;
    395       const src_rep_t roundBits = denormalizedSignificand & roundMask;
    396       // Round to nearest
    397       if (roundBits > halfway)
    398         dstSigFrac++;
    399       // Ties to even
    400       else if (roundBits == halfway)
    401         dstSigFrac += dstSigFrac & 1;
    402 
    403       // Rounding has changed the exponent.
    404       if (dstSigFrac >= (DST_REP_C(1) << dstSigFracBits)) {
    405         dstExp += 1;
    406         dstSigFrac ^= (DST_REP_C(1) << dstSigFracBits);
    407       }
    408     }
    409   }
    410 
    411   return dstFromRep(construct_dst_rep(dstSign, dstExp, dstSigFrac));
    412 }
    413 
    414 #endif // _FP_TRUNC_IMPL_EMIT
	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README