From fe9b24662430a3369f1f2af51c62ca5c21a2ad09 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 7 Mar 2026 00:00:34 +0000 Subject: [PATCH] Improve Lehmer GCD: use 2-digit (64-bit) approximation with __int128 overflow safety Co-authored-by: levnach <5377127+levnach@users.noreply.github.com> --- src/util/mpz.cpp | 138 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 102 insertions(+), 36 deletions(-) diff --git a/src/util/mpz.cpp b/src/util/mpz.cpp index 10c345b0c..98570269a 100644 --- a/src/util/mpz.cpp +++ b/src/util/mpz.cpp @@ -1103,10 +1103,33 @@ void mpz_manager::gcd(mpz const & a, mpz const & b, mpz & c) { #endif // LS_BINARY_GCD #ifdef LEHMER_GCD - // For now, it only works if sizeof(digit_t) == sizeof(unsigned) + // Works only when sizeof(digit_t) == sizeof(unsigned) (32-bit digits). static_assert(sizeof(digit_t) == sizeof(unsigned), ""); - - int64_t a_hat, b_hat, A, B, C, D, T, q, a_sz, b_sz; + + // ----------------------------------------------------------------------- + // 2-digit Lehmer GCD + // + // Classic Lehmer's algorithm builds 1-digit (32-bit) approximations + // â and b̂ from the leading digits of a1 and b1, then simulates the + // Euclidean algorithm on those small values to accumulate a 2×2 matrix + // [A B; C D] that can be applied to the actual big integers in one shot, + // saving expensive big-integer divisions. + // + // This version extends that idea to 2-digit (64-bit) approximations, + // which roughly doubles the number of simulated Euclidean steps per outer + // iteration and thus halves the number of costly big-integer operations. + // + // Overflow safety: + // With a 62-bit â (the 64-bit two-digit value shifted right by 2 to + // fit in a signed int64_t), the Fibonacci-worst-case inner loop runs + // for at most ~90 steps before b̂ reaches 0. At that point the matrix + // entries reach Fib(90) ≈ 2.88 × 10^18 < INT64_MAX. The intermediate + // PRODUCTS (q · C etc.), however, can exceed INT64_MAX in rare cases. + // We use __int128 (available on GCC/Clang) for those multiplications + // and check for overflow; on other compilers the 1-digit (original) + // path is used as a safe fallback. + // ----------------------------------------------------------------------- + mpz a1, b1, t, r, tmp; set(a1, a); set(b1, b); @@ -1118,8 +1141,8 @@ void mpz_manager::gcd(mpz const & a, mpz const & b, mpz & c) { SASSERT(ge(a1, b1)); if (is_small(b1)) { if (is_small(a1)) { - unsigned r = u_gcd(a1.m_val, b1.m_val); - set(c, r); + unsigned r_val = u_gcd(a1.m_val, b1.m_val); + set(c, r_val); break; } else { @@ -1135,60 +1158,103 @@ void mpz_manager::gcd(mpz const & a, mpz const & b, mpz & c) { } SASSERT(!is_small(a1)); SASSERT(!is_small(b1)); - a_sz = a1.m_ptr->m_size; - b_sz = b1.m_ptr->m_size; + + int64_t a_sz = a1.m_ptr->m_size; + int64_t b_sz = b1.m_ptr->m_size; SASSERT(b_sz <= a_sz); - a_hat = a1.m_ptr->m_digits[a_sz - 1]; - b_hat = (b_sz == a_sz) ? b1.m_ptr->m_digits[b_sz - 1] : 0; - A = 1; - B = 0; - C = 0; - D = 1; + + int64_t a_hat, b_hat, A, B, C, D, T, q; + +#if defined(__GNUC__) || defined(__clang__) + // ---- 2-digit (64-bit) path ---- + // Build 62-bit approximations from the top two 32-bit digits. + // Shift right by 2 so they fit in int64_t (bit 63 unused for sign, + // bit 62 unused as arithmetic buffer). + { + uint64_t ah = (uint64_t)a1.m_ptr->m_digits[a_sz - 1]; + uint64_t al = (a_sz >= 2) ? (uint64_t)a1.m_ptr->m_digits[a_sz - 2] : 0; + a_hat = (int64_t)(((ah << 32) | al) >> 2); + } + { + if (b_sz == a_sz) { + // Same number of digits: take top two digits of b1. + uint64_t bh = (uint64_t)b1.m_ptr->m_digits[b_sz - 1]; + uint64_t bl = (b_sz >= 2) ? (uint64_t)b1.m_ptr->m_digits[b_sz - 2] : 0; + b_hat = (int64_t)(((bh << 32) | bl) >> 2); + } else if (b_sz == a_sz - 1) { + // b1 has one fewer digit: its leading digit sits at the + // same scale as a1's *second* digit. In the 64-bit window + // that is just a 32-bit value (upper half is 0). + b_hat = (int64_t)((uint64_t)b1.m_ptr->m_digits[b_sz - 1] >> 2); + } else { + // b1 is more than one digit smaller → b_hat ≈ 0. + // The inner loop will immediately set B=0 and we fall + // through to a single Euclidean step. + b_hat = 0; + } + } + + A = 1; B = 0; C = 0; D = 1; while (true) { - // Loop invariants - SASSERT(a_hat + A <= static_cast(UINT_MAX) + 1); - SASSERT(a_hat + B < static_cast(UINT_MAX) + 1); - SASSERT(b_hat + C < static_cast(UINT_MAX) + 1); - SASSERT(b_hat + D <= static_cast(UINT_MAX) + 1); - // overflows can't happen since I'm using int64 - if (b_hat + C == 0 || b_hat + D == 0) + if (b_hat + C <= 0 || b_hat + D <= 0) break; - q = (a_hat + A)/(b_hat + C); - if (q != (a_hat + B)/(b_hat + D)) + q = (a_hat + A) / (b_hat + C); + if (q != (a_hat + B) / (b_hat + D)) break; - T = A - q*C; - A = C; - C = T; - T = B - q*D; - B = D; - D = T; - T = a_hat - q*b_hat; + // Use __int128 for the cofactor updates to avoid int64_t + // overflow in the rare cases where q is large. + __int128 T128; + T128 = (__int128)A - (__int128)q * C; + if (T128 < INT64_MIN || T128 > INT64_MAX) break; + T = (int64_t)T128; + A = C; C = T; + T128 = (__int128)B - (__int128)q * D; + if (T128 < INT64_MIN || T128 > INT64_MAX) break; + T = (int64_t)T128; + B = D; D = T; + T = a_hat - q * b_hat; a_hat = b_hat; b_hat = T; } +#else + // ---- 1-digit (32-bit) fallback for compilers without __int128 ---- + a_hat = a1.m_ptr->m_digits[a_sz - 1]; + b_hat = (b_sz == a_sz) ? b1.m_ptr->m_digits[b_sz - 1] : 0; + A = 1; B = 0; C = 0; D = 1; + while (true) { + if (b_hat + C == 0 || b_hat + D == 0) + break; + q = (a_hat + A) / (b_hat + C); + if (q != (a_hat + B) / (b_hat + D)) + break; + T = A - q*C; A = C; C = T; + T = B - q*D; B = D; D = T; + T = a_hat - q*b_hat; + a_hat = b_hat; b_hat = T; + } +#endif // __GNUC__ || __clang__ + SASSERT(ge(a1, b1)); if (B == 0) { + // No useful matrix was accumulated; do one Euclidean step. rem(a1, b1, t); swap(a1, b1); swap(b1, t); SASSERT(ge(a1, b1)); } else { - // t <- A*a1 + // Apply the accumulated 2×2 matrix to the actual big integers. + // t = A*a1 + B*b1 set(tmp, A); - mul(a1, tmp, t); - // t <- t + B*b1 + mul(a1, tmp, t); set(tmp, B); addmul(t, tmp, b1, t); - // r <- C*a1 + // r = C*a1 + D*b1 set(tmp, C); mul(a1, tmp, r); - // r <- r + D*b1 set(tmp, D); addmul(r, tmp, b1, r); - // a <- t swap(a1, t); - // b <- r swap(b1, r); SASSERT(ge(a1, b1)); }