// fp_trunc_impl.inc (14405B)
//= lib/fp_trunc_impl.inc - high precision -> low precision conversion *-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a fairly generic conversion from a wider to a narrower
// IEEE-754 floating-point type in the default (round to nearest, ties to even)
// rounding mode. The constants and types defined following the includes below
// parameterize the conversion.
//
// This routine can be trivially adapted to support conversions to
// half-precision or from quad-precision. It does not support types that don't
// use the usual IEEE-754 interchange formats; specifically, some work would be
// needed to adapt it to (for example) the Intel 80-bit format or PowerPC
// double-double format.
//
// Note please, however, that this implementation is only intended to support
// *narrowing* operations; if you need to convert to a *wider* floating-point
// type (e.g. float -> double), then this routine will not do what you want it
// to.
//
// It also requires that integer types at least as large as both formats
// are available on the target platform; this may pose a problem when trying
// to add support for quad on some 32-bit systems, for example.
//
// Finally, the following assumptions are made:
//
// 1. Floating-point types and integer types have the same endianness on the
//    target platform.
//
// 2. Quiet NaNs, if supported, are indicated by the leading bit of the
//    significand field being set.
36 // 37 //===----------------------------------------------------------------------===// 38 39 // ---- fp_trunc.h (was a separate header; merged) ---- 40 #include "int_lib.h" 41 42 // Self-clean any prior inclusion's per-(src, dst) state. May coexist with 43 // fp_extend.h / int_to_fp.h which define overlapping bare-name aliases. 44 #undef _FP_TRUNC_SRC_SUF 45 #undef _FP_TRUNC_DST_SUF 46 #undef SRC_REP_C 47 #undef DST_REP_C 48 #undef src_t 49 #undef src_rep_t 50 #undef dst_t 51 #undef dst_rep_t 52 #undef srcBits 53 #undef srcSigFracBits 54 #undef srcExpBits 55 #undef dstBits 56 #undef dstSigFracBits 57 #undef dstExpBits 58 #undef srcToRep 59 #undef dstFromRep 60 #undef extract_sign_from_src 61 #undef extract_exp_from_src 62 #undef extract_sig_frac_from_src 63 #undef construct_dst_rep 64 65 #if defined SRC_SINGLE 66 #define _FP_TRUNC_SRC_SUF sf 67 #elif defined SRC_DOUBLE 68 #define _FP_TRUNC_SRC_SUF df 69 #elif defined SRC_QUAD 70 #define _FP_TRUNC_SRC_SUF tf 71 #else 72 #error Source should be double precision or quad precision! 73 #endif 74 75 #if defined DST_SINGLE 76 #define _FP_TRUNC_DST_SUF sf 77 #elif defined DST_DOUBLE 78 #define _FP_TRUNC_DST_SUF df 79 #elif defined DST_80 80 #define _FP_TRUNC_DST_SUF xf 81 #elif defined DST_HALF 82 #define _FP_TRUNC_DST_SUF hf 83 #elif defined DST_BFLOAT 84 #define _FP_TRUNC_DST_SUF bf 85 #else 86 #error Destination should be single precision or double precision! 87 #endif 88 89 #define _FP_TRUNC_PASTE4_(a, b, c, d) a##b##c##d 90 #define _FP_TRUNC_PASTE4(a, b, c, d) _FP_TRUNC_PASTE4_(a, b, c, d) 91 #define _FP_TRUNC_PAIR(stem) _FP_TRUNC_PASTE4(stem, _, _FP_TRUNC_SRC_SUF, _FP_TRUNC_DST_SUF) 92 93 // ---- Bare-name aliases (re-set every inclusion). ------------------------ 94 // fp_extend.h uses identical bare names; the aliases here suffix-rename 95 // to fp_trunc-specific symbols (different pair tokens), so the two 96 // headers can coexist in one TU without colliding. 
97 98 #define src_t _FP_TRUNC_PAIR(src_t) 99 #define src_rep_t _FP_TRUNC_PAIR(src_rep_t) 100 #define dst_t _FP_TRUNC_PAIR(dst_t) 101 #define dst_rep_t _FP_TRUNC_PAIR(dst_rep_t) 102 #define srcBits _FP_TRUNC_PAIR(srcBits) 103 #define srcSigFracBits _FP_TRUNC_PAIR(srcSigFracBits) 104 #define srcExpBits _FP_TRUNC_PAIR(srcExpBits) 105 #define dstBits _FP_TRUNC_PAIR(dstBits) 106 #define dstSigFracBits _FP_TRUNC_PAIR(dstSigFracBits) 107 #define dstExpBits _FP_TRUNC_PAIR(dstExpBits) 108 #define srcToRep _FP_TRUNC_PAIR(srcToRep) 109 #define dstFromRep _FP_TRUNC_PAIR(dstFromRep) 110 #define extract_sign_from_src _FP_TRUNC_PAIR(extract_sign_from_src) 111 #define extract_exp_from_src _FP_TRUNC_PAIR(extract_exp_from_src) 112 #define extract_sig_frac_from_src _FP_TRUNC_PAIR(extract_sig_frac_from_src) 113 #define construct_dst_rep _FP_TRUNC_PAIR(construct_dst_rep) 114 115 #if defined SRC_SINGLE 116 #define SRC_REP_C UINT32_C 117 #elif defined SRC_DOUBLE 118 #define SRC_REP_C UINT64_C 119 #elif defined SRC_QUAD 120 #define SRC_REP_C (__uint128_t) 121 #endif 122 123 #if defined DST_SINGLE 124 #define DST_REP_C UINT32_C 125 #elif defined DST_DOUBLE 126 #define DST_REP_C UINT64_C 127 #elif defined DST_80 128 #define DST_REP_C (__uint128_t) 129 #elif defined DST_HALF 130 #define DST_REP_C UINT16_C 131 #elif defined DST_BFLOAT 132 #define DST_REP_C UINT16_C 133 #endif 134 135 // ---- One-time emission per (TU, src+dst pair). -------------------------- 136 // Pairs kit uses: df→sf, tf→df, tf→sf. 

// Emission guard for the type/constant/helper definitions. _FP_TRUNC_EMIT is
// set only the first time one of the three supported pairs (double -> single,
// quad -> double, quad -> single) is seen; the matching FP_TRUNC_*_EMITTED
// marker then suppresses re-emission if this file is included again for the
// same pair. For any other SRC_*/DST_* combination nothing below is emitted.
#if defined SRC_DOUBLE && defined DST_SINGLE && !defined FP_TRUNC_DFSF_EMITTED
#define FP_TRUNC_DFSF_EMITTED
#define _FP_TRUNC_EMIT 1
#elif defined SRC_QUAD && defined DST_DOUBLE && !defined FP_TRUNC_TFDF_EMITTED
#define FP_TRUNC_TFDF_EMITTED
#define _FP_TRUNC_EMIT 1
#elif defined SRC_QUAD && defined DST_SINGLE && !defined FP_TRUNC_TFSF_EMITTED
#define FP_TRUNC_TFSF_EMITTED
#define _FP_TRUNC_EMIT 1
#endif

// This conditional region stays open past the end of the type parameters; it
// is closed by the "#endif // _FP_TRUNC_EMIT" that follows dstFromRep.
#ifdef _FP_TRUNC_EMIT
#undef _FP_TRUNC_EMIT

// ---- Source format parameters -------------------------------------------
// src_t          floating-point type being converted from
// src_rep_t      unsigned integer type used to hold its bit pattern
// srcBits        total width of src_t in bits
// srcSigFracBits number of stored significand (fraction) bits
// srcExpBits     number of exponent bits
//
// NOTE(review): the emission guard above only fires for SRC_DOUBLE and
// SRC_QUAD, so the SRC_SINGLE branch is not reachable as written.
#if defined SRC_SINGLE
typedef float src_t;
typedef uint32_t src_rep_t;
static const int srcBits = sizeof(src_t) * CHAR_BIT;
static const int srcSigFracBits = 23;
// -1 accounts for the sign bit.
// srcBits - srcSigFracBits - 1
static const int srcExpBits = 8;

#elif defined SRC_DOUBLE
typedef double src_t;
typedef uint64_t src_rep_t;
static const int srcBits = sizeof(src_t) * CHAR_BIT;
static const int srcSigFracBits = 52;
// -1 accounts for the sign bit.
// srcBits - srcSigFracBits - 1
static const int srcExpBits = 11;

#elif defined SRC_QUAD
typedef tf_float src_t;
typedef __uint128_t src_rep_t;
static const int srcBits = sizeof(src_t) * CHAR_BIT;
static const int srcSigFracBits = 112;
// -1 accounts for the sign bit.
// srcBits - srcSigFracBits - 1
static const int srcExpBits = 15;

#endif // end source precision

// ---- Destination format parameters --------------------------------------
// Same five parameters as above, for the type being converted to.
//
// NOTE(review): the emission guard above only fires for DST_SINGLE and
// DST_DOUBLE, so the DST_80, DST_HALF and DST_BFLOAT branches are not
// reachable as written.
#if defined DST_DOUBLE
typedef double dst_t;
typedef uint64_t dst_rep_t;
static const int dstBits = sizeof(dst_t) * CHAR_BIT;
static const int dstSigFracBits = 52;
// -1 accounts for the sign bit.
// dstBits - dstSigFracBits - 1
static const int dstExpBits = 11;

#elif defined DST_80
typedef xf_float dst_t;
typedef __uint128_t dst_rep_t;
// The width is given literally here rather than derived from sizeof(dst_t).
static const int dstBits = 80;
static const int dstSigFracBits = 63;
// -1 accounts for the sign bit.
// -1 accounts for the explicitly stored integer bit.
// dstBits - dstSigFracBits - 1 - 1
static const int dstExpBits = 15;

#elif defined DST_SINGLE
typedef float dst_t;
typedef uint32_t dst_rep_t;
static const int dstBits = sizeof(dst_t) * CHAR_BIT;
static const int dstSigFracBits = 23;
// -1 accounts for the sign bit.
// dstBits - dstSigFracBits - 1
static const int dstExpBits = 8;

#elif defined DST_HALF
// Without _Float16 support the result is carried as a raw 16-bit integer.
#ifdef COMPILER_RT_HAS_FLOAT16
typedef _Float16 dst_t;
#else
typedef uint16_t dst_t;
#endif
typedef uint16_t dst_rep_t;
static const int dstBits = sizeof(dst_t) * CHAR_BIT;
static const int dstSigFracBits = 10;
// -1 accounts for the sign bit.
// dstBits - dstSigFracBits - 1
static const int dstExpBits = 5;

#elif defined DST_BFLOAT
typedef __bf16 dst_t;
typedef uint16_t dst_rep_t;
static const int dstBits = sizeof(dst_t) * CHAR_BIT;
static const int dstSigFracBits = 7;
// -1 accounts for the sign bit.
// dstBits - dstSigFracBits - 1
static const int dstExpBits = 8;

#endif // end destination precision

// TODO: These helper routines should be placed into fp_lib.h
// Currently they depend on macros/constants defined above.
235 236 static inline src_rep_t extract_sign_from_src(src_rep_t x) { 237 const src_rep_t srcSignMask = SRC_REP_C(1) << (srcBits - 1); 238 return (x & srcSignMask) >> (srcBits - 1); 239 } 240 241 static inline src_rep_t extract_exp_from_src(src_rep_t x) { 242 const int srcSigBits = srcBits - 1 - srcExpBits; 243 const src_rep_t srcExpMask = ((SRC_REP_C(1) << srcExpBits) - 1) << srcSigBits; 244 return (x & srcExpMask) >> srcSigBits; 245 } 246 247 static inline src_rep_t extract_sig_frac_from_src(src_rep_t x) { 248 const src_rep_t srcSigFracMask = (SRC_REP_C(1) << srcSigFracBits) - 1; 249 return x & srcSigFracMask; 250 } 251 252 static inline dst_rep_t construct_dst_rep(dst_rep_t sign, dst_rep_t exp, dst_rep_t sigFrac) { 253 dst_rep_t result = (sign << (dstBits - 1)) | (exp << (dstBits - 1 - dstExpBits)) | sigFrac; 254 // Set the explicit integer bit in F80 if present. 255 if (dstBits == 80 && exp) { 256 result |= (DST_REP_C(1) << dstSigFracBits); 257 } 258 return result; 259 } 260 261 // End of specialization parameters. Two helper routines for conversion to and 262 // from the representation of floating-point data as integer values follow. 
263 264 static inline src_rep_t srcToRep(src_t x) { 265 const union { 266 src_t f; 267 src_rep_t i; 268 } rep = {.f = x}; 269 return rep.i; 270 } 271 272 static inline dst_t dstFromRep(dst_rep_t x) { 273 const union { 274 dst_t f; 275 dst_rep_t i; 276 } rep = {.i = x}; 277 return rep.f; 278 } 279 280 #endif // _FP_TRUNC_EMIT 281 282 #define __truncXfYf2__ _FP_TRUNC_PAIR(__truncXfYf2__) 283 284 #if defined SRC_DOUBLE && defined DST_SINGLE && !defined FP_TRUNC_IMPL_DFSF_EMITTED 285 #define FP_TRUNC_IMPL_DFSF_EMITTED 286 #define _FP_TRUNC_IMPL_EMIT 1 287 #elif defined SRC_QUAD && defined DST_DOUBLE && !defined FP_TRUNC_IMPL_TFDF_EMITTED 288 #define FP_TRUNC_IMPL_TFDF_EMITTED 289 #define _FP_TRUNC_IMPL_EMIT 1 290 #elif defined SRC_QUAD && defined DST_SINGLE && !defined FP_TRUNC_IMPL_TFSF_EMITTED 291 #define FP_TRUNC_IMPL_TFSF_EMITTED 292 #define _FP_TRUNC_IMPL_EMIT 1 293 #endif 294 295 #ifdef _FP_TRUNC_IMPL_EMIT 296 #undef _FP_TRUNC_IMPL_EMIT 297 298 // The destination type may use a usual IEEE-754 interchange format or Intel 299 // 80-bit format. In particular, for the destination type dstSigFracBits may be 300 // not equal to dstSigBits. The source type is assumed to be one of IEEE-754 301 // standard types. 302 static inline dst_t __truncXfYf2__(src_t a) { 303 // Various constants whose values follow from the type parameters. 304 // Any reasonable optimizer will fold and propagate all of these. 
305 const int srcInfExp = (1 << srcExpBits) - 1; 306 const int srcExpBias = srcInfExp >> 1; 307 308 const src_rep_t srcMinNormal = SRC_REP_C(1) << srcSigFracBits; 309 const src_rep_t roundMask = 310 (SRC_REP_C(1) << (srcSigFracBits - dstSigFracBits)) - 1; 311 const src_rep_t halfway = SRC_REP_C(1) 312 << (srcSigFracBits - dstSigFracBits - 1); 313 const src_rep_t srcQNaN = SRC_REP_C(1) << (srcSigFracBits - 1); 314 const src_rep_t srcNaNCode = srcQNaN - 1; 315 316 const int dstInfExp = (1 << dstExpBits) - 1; 317 const int dstExpBias = dstInfExp >> 1; 318 const int overflowExponent = srcExpBias + dstInfExp - dstExpBias; 319 320 const dst_rep_t dstQNaN = DST_REP_C(1) << (dstSigFracBits - 1); 321 const dst_rep_t dstNaNCode = dstQNaN - 1; 322 323 const src_rep_t aRep = srcToRep(a); 324 const src_rep_t srcSign = extract_sign_from_src(aRep); 325 const src_rep_t srcExp = extract_exp_from_src(aRep); 326 const src_rep_t srcSigFrac = extract_sig_frac_from_src(aRep); 327 328 dst_rep_t dstSign = srcSign; 329 dst_rep_t dstExp; 330 dst_rep_t dstSigFrac; 331 332 // Same size exponents and a's significand tail is 0. 333 // The significand can be truncated and the exponent can be copied over. 334 const int sigFracTailBits = srcSigFracBits - dstSigFracBits; 335 if (srcExpBits == dstExpBits && 336 ((aRep >> sigFracTailBits) << sigFracTailBits) == aRep) { 337 dstExp = srcExp; 338 dstSigFrac = (dst_rep_t)(srcSigFrac >> sigFracTailBits); 339 return dstFromRep(construct_dst_rep(dstSign, dstExp, dstSigFrac)); 340 } 341 342 const int dstExpCandidate = ((int)srcExp - srcExpBias) + dstExpBias; 343 if (dstExpCandidate >= 1 && dstExpCandidate < dstInfExp) { 344 // The exponent of a is within the range of normal numbers in the 345 // destination format. We can convert by simply right-shifting with 346 // rounding and adjusting the exponent. 
347 dstExp = dstExpCandidate; 348 dstSigFrac = (dst_rep_t)(srcSigFrac >> sigFracTailBits); 349 350 const src_rep_t roundBits = srcSigFrac & roundMask; 351 // Round to nearest. 352 if (roundBits > halfway) 353 dstSigFrac++; 354 // Tie to even. 355 else if (roundBits == halfway) 356 dstSigFrac += dstSigFrac & 1; 357 358 // Rounding has changed the exponent. 359 if (dstSigFrac >= (DST_REP_C(1) << dstSigFracBits)) { 360 dstExp += 1; 361 dstSigFrac ^= (DST_REP_C(1) << dstSigFracBits); 362 } 363 } else if (srcExp == srcInfExp && srcSigFrac) { 364 // a is NaN. 365 // Conjure the result by beginning with infinity, setting the qNaN 366 // bit and inserting the (truncated) trailing NaN field. 367 dstExp = dstInfExp; 368 dstSigFrac = dstQNaN; 369 dstSigFrac |= ((srcSigFrac & srcNaNCode) >> sigFracTailBits) & dstNaNCode; 370 } else if ((int)srcExp >= overflowExponent) { 371 dstExp = dstInfExp; 372 dstSigFrac = 0; 373 } else { 374 // a underflows on conversion to the destination type or is an exact 375 // zero. The result may be a denormal or zero. Extract the exponent 376 // to get the shift amount for the denormalization. 377 src_rep_t significand = srcSigFrac; 378 int shift = srcExpBias - dstExpBias - srcExp; 379 380 if (srcExp) { 381 // Set the implicit integer bit if the source is a normal number. 382 significand |= srcMinNormal; 383 shift += 1; 384 } 385 386 // Right shift by the denormalization amount with sticky. 
387 if (shift > srcSigFracBits) { 388 dstExp = 0; 389 dstSigFrac = 0; 390 } else { 391 dstExp = 0; 392 const bool sticky = shift && ((significand << (srcBits - shift)) != 0); 393 src_rep_t denormalizedSignificand = significand >> shift | sticky; 394 dstSigFrac = denormalizedSignificand >> sigFracTailBits; 395 const src_rep_t roundBits = denormalizedSignificand & roundMask; 396 // Round to nearest 397 if (roundBits > halfway) 398 dstSigFrac++; 399 // Ties to even 400 else if (roundBits == halfway) 401 dstSigFrac += dstSigFrac & 1; 402 403 // Rounding has changed the exponent. 404 if (dstSigFrac >= (DST_REP_C(1) << dstSigFracBits)) { 405 dstExp += 1; 406 dstSigFrac ^= (DST_REP_C(1) << dstSigFracBits); 407 } 408 } 409 } 410 411 return dstFromRep(construct_dst_rep(dstSign, dstExp, dstSigFrac)); 412 } 413 414 #endif // _FP_TRUNC_IMPL_EMIT