From 43890666029748669645e22d4eaf878964efdbe7 Mon Sep 17 00:00:00 2001 From: Fredrik Holmqvist Date: Wed, 22 Jul 2020 10:10:16 +0200 Subject: [PATCH] Redo Update GMP to 6.2.0 GCC had a horrible .gitignore, untracked files were not applied --- gcc/gmp/asl.h | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/gmp.pc.in | 11 +++++++++++ gcc/gmp/gmpxx.pc.in | 12 ++++++++++++ gcc/gmp/mini-gmp/mini-mpq.c | 554 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mini-gmp/mini-mpq.h | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpz/lucmod.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpz/stronglucas.c | 178 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/tune/hgcd2-1.c | 39 +++++++++++++++++++++++++++++++++++++++ gcc/gmp/tune/hgcd2-2.c | 39 +++++++++++++++++++++++++++++++++++++++ gcc/gmp/tune/hgcd2-3.c | 39 +++++++++++++++++++++++++++++++++++++++ gcc/gmp/tune/hgcd2-4.c | 39 +++++++++++++++++++++++++++++++++++++++ gcc/gmp/tune/hgcd2-5.c | 39 +++++++++++++++++++++++++++++++++++++++ gcc/gmp/tune/hgcd2.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mini-gmp/tests/t-lucm.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mini-gmp/tests/t-mpq_addsub.c | 204 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mini-gmp/tests/t-mpq_double.c | 214 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mini-gmp/tests/t-mpq_muldiv.c | 176 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mini-gmp/tests/t-mpq_muldiv_2exp.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mini-gmp/tests/t-mpq_str.c | 252 
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/arm/bdiv_q_1.asm | 162 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/arm64/aorsorrlsh1_n.asm | 43 +++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/arm64/aorsorrlsh2_n.asm | 43 +++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/arm64/aorsorrlshC_n.asm | 139 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/arm64/bdiv_q_1.asm | 128 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/arm64/gcd_11.asm | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/arm64/gcd_22.asm | 112 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/arm64/lshiftc.asm | 130 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/arm64/rsh1aors_n.asm | 168 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/arm64/sqr_diag_addlsh1.asm | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/generic/compute_powtab.c | 373 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/generic/fib2m.c | 252 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/generic/gcd_11.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/generic/gcd_22.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/generic/sbpi1_bdiv_r.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/generic/strongfibo.c | 216 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/ia64/gcd_11.asm | 110 
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/sparc64/gcd_11.asm | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86/gcd_11.asm | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/gcd_11.asm | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/gcd_22.asm | 163 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/tests/cxx/t-ops2.h | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/tests/cxx/t-ops2f.cc | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/tests/cxx/t-ops2qf.cc | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/tests/cxx/t-ops2z.cc | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/tests/devel/addmul_N.c | 272 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/tests/devel/cnd_aors_n.c | 257 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/tests/devel/mul_N.c | 270 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/tests/devel/primes.c | 341 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/tests/devel/sqrtrem_1_2.c | 401 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/tests/mpn/t-fib2m.c | 344 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/tests/mpn/t-gcd_11.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/tests/mpn/t-gcd_22.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/tests/mpn/t-gcdext_1.c | 131 
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/tests/mpz/t-lucm.c | 144 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/alpha/ev67/gcd_11.asm | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/arm/v5/gcd_11.asm | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/arm/v6t2/gcd_11.asm | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/arm/v6t2/gcd_22.asm | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/arm64/cora53/cnd_aors_n.asm | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/arm64/cora53/gmp-mparam.h | 242 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/arm64/cora57/gmp-mparam.h | 187 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/arm64/cora72/gmp-mparam.h | 242 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/arm64/cora73/gmp-mparam.h | 225 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/arm64/xgene1/gmp-mparam.h | 181 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/mips64/hilo/addmul_1.asm | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/mips64/hilo/mul_1.asm | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/mips64/hilo/sqr_diagonal.asm | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/mips64/hilo/submul_1.asm | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/mips64/hilo/umul.asm | 45 +++++++++++++++++++++++++++++++++++++++++++++ 
gcc/gmp/mpn/powerpc64/mode64/bdiv_q_1.asm | 146 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/powerpc64/mode64/gcd_11.asm | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/riscv/64/aors_n.asm | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/riscv/64/aorsmul_1.asm | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/riscv/64/mul_1.asm | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/sparc64/ultrasparct3/bdiv_q_1.asm | 137 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/sparc64/ultrasparct45/gmp-mparam.h | 173 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86/bd4/gmp-mparam.h | 225 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86/bt1/gmp-mparam.h | 218 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86/bt2/gmp-mparam.h | 214 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86/coreibwl/gmp-mparam.h | 216 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86/goldmont/gmp-mparam.h | 219 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86/k7/gcd_11.asm | 107 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86/p6/gcd_11.asm | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86/silvermont/gmp-mparam.h | 222 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86/skylake/gmp-mparam.h | 211 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86/zn1/gmp-mparam.h | 220 
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86/zn2/gmp-mparam.h | 226 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/atom/cnd_add_n.asm | 38 ++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/atom/cnd_sub_n.asm | 38 ++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/bd1/addmul_2.asm | 235 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/bd1/aorrlsh_n.asm | 38 ++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/bd1/aors_n.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/bd1/gcd_11.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/bd2/gcd_11.asm | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/bd2/gcd_22.asm | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/bd4/aorrlsh_n.asm | 38 ++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/bd4/gcd_11.asm | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/bd4/gcd_22.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/bd4/gmp-mparam.h | 266 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/bt1/aors_n.asm | 159 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/bt1/aorsmul_1.asm | 191 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/bt1/copyd.asm | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/bt1/copyi.asm | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/bt1/gcd_11.asm | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 
gcc/gmp/mpn/x86_64/bt1/gcd_22.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/bt1/gmp-mparam.h | 230 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/bt1/mul_1.asm | 241 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/bt1/mul_basecase.asm | 486 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/bt1/redc_1.asm | 507 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/bt1/sqr_basecase.asm | 565 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/bt2/com.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/bt2/copyd.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/bt2/copyi.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/bt2/gcd_11.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/bt2/gcd_22.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/bt2/gmp-mparam.h | 240 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/core2/com.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/core2/gcd_11.asm | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/core2/gcd_22.asm | 137 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/core2/hamdist.asm | 210 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/core2/logops_n.asm | 285 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/coreibwl/mullo_basecase.asm | 395 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/coreihwl/aorrlsh_n.asm | 38 ++++++++++++++++++++++++++++++++++++++ 
gcc/gmp/mpn/x86_64/coreihwl/aors_n.asm | 261 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/coreihwl/gcd_22.asm | 138 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/coreisbr/cnd_add_n.asm | 174 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/coreisbr/cnd_sub_n.asm | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/coreisbr/gcd_11.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/fat/addmul_2.c | 38 ++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/goldmont/aorrlsh_n.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/goldmont/aors_n.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/goldmont/aorsmul_1.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/goldmont/gmp-mparam.h | 264 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/goldmont/mul_1.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/goldmont/redc_1.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/k10/gcd_11.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/k10/gcd_22.asm | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/k8/addmul_2.asm | 195 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/k8/bdiv_q_1.asm | 179 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/nano/gcd_11.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/pentium4/addmul_2.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/pentium4/aorsmul_1.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/pentium4/mul_1.asm | 37 +++++++++++++++++++++++++++++++++++++ 
gcc/gmp/mpn/x86_64/pentium4/mul_2.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/pentium4/mul_basecase.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/pentium4/mullo_basecase.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/pentium4/redc_1.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/pentium4/sqr_basecase.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/silvermont/aorrlsh1_n.asm | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/silvermont/aorrlsh2_n.asm | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/silvermont/aors_n.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/silvermont/aorsmul_1.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/silvermont/gmp-mparam.h | 252 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/silvermont/hamdist.asm | 38 ++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/silvermont/lshift.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/silvermont/lshiftc.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/silvermont/mul_1.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/silvermont/mul_basecase.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/silvermont/mullo_basecase.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/silvermont/popcount.asm | 38 ++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/silvermont/rshift.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/silvermont/sqr_basecase.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/zen/aorrlsh1_n.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/zen/aorrlsh_n.asm | 226 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/zen/aorsmul_1.asm | 165 
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/zen/com.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/zen/copyd.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/zen/copyi.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/zen/gcd_11.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/zen/gcd_22.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/zen/gmp-mparam.h | 280 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/zen/hamdist.asm | 38 ++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/zen/lshift.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/zen/lshiftc.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/zen/mul_1.asm | 161 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/zen/mul_basecase.asm | 455 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/zen/mullo_basecase.asm | 299 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/zen/popcount.asm | 38 ++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/zen/rshift.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/zen/sbpi1_bdiv_r.asm | 507 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/zen/sqr_basecase.asm | 482 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/zen/sublsh1_n.asm | 37 +++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/x86_64/zen2/gmp-mparam.h | 276 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/arm/v7a/cora15/bdiv_q_1.asm | 36 ++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/arm/v7a/cora17/addmul_1.asm | 34 ++++++++++++++++++++++++++++++++++ 
gcc/gmp/mpn/arm/v7a/cora17/gmp-mparam.h | 233 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/arm/v7a/cora17/mod_34lsub1.asm | 121 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/arm/v7a/cora17/mul_1.asm | 34 ++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/arm/v7a/cora17/submul_1.asm | 34 ++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/arm/v7a/cora5/gmp-mparam.h | 205 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/arm/v7a/cora8/bdiv_q_1.asm | 158 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/arm/v7a/cora9/bdiv_q_1.asm | 36 ++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/powerpc64/mode64/p7/gcd_11.asm | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/powerpc64/mode64/p7/gcd_22.asm | 146 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/powerpc64/mode64/p8/gmp-mparam.h | 170 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/powerpc64/mode64/p9/add_n_sub_n.asm | 112 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/powerpc64/mode64/p9/addmul_1.asm | 130 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/powerpc64/mode64/p9/addmul_2.asm | 182 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/powerpc64/mode64/p9/aorsmul_1.asm | 179 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/powerpc64/mode64/p9/gcd_11.asm | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/powerpc64/mode64/p9/gcd_22.asm | 143 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/powerpc64/mode64/p9/gmp-mparam.h | 253 
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/powerpc64/mode64/p9/mul_1.asm | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/powerpc64/mode64/p9/mul_2.asm | 170 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/powerpc64/mode64/p9/mul_basecase.asm | 415 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gcc/gmp/mpn/powerpc64/mode64/p9/sqr_basecase.asm | 555 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 206 files changed, 28100 insertions(+) diff --git a/gcc/gmp/asl.h b/gcc/gmp/asl.h new file mode 100644 index 0000000..64deea0 100644 --- /dev/null +++ b/gcc/gmp/asl.h @@ -1,0 +1,127 @@ +/* asl.h -- artificially small limbs support by means of C++ operator + overloading. + +Copyright 2016 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include +#include +#include +// #include + +#ifndef GMP_ASSERT_ALWAYS +#define GMP_ASSERT_ALWAYS(cc) do {if (!(cc)) abort();} while (0) +#endif + +// Missing: post++ post-- ++pre --prec bool(limb) !limb + +#ifndef GMP_LIMB_BITS +#define GMP_LIMB_BITS 4 +#endif + +#define GMP_NUMB_MASK (2 * (1ul << (GMP_LIMB_BITS - 1)) - 1) + +#define BINOP_MASK(op, type) \ + mp_limb_t& operator op##=(const type& rhs) { \ + limbo = (limbo op rhs.limbo) & GMP_NUMB_MASK; \ + return *this; \ + } +#define BINOP_NOMASK(op, type) \ + mp_limb_t& operator op##=(const type& rhs) { \ + limbo = limbo op rhs.limbo; \ + return *this; \ + } + +typedef std::conditional<(GMP_NUMB_MASK <= 0xffff), uint16_t, uint32_t >::type type24; +typedef std::conditional<(GMP_NUMB_MASK <= 0xff), uint8_t, type24>::type mtype; + +class mp_limb_t { +public: + mp_limb_t() {} // put random garbage in limbo? + mp_limb_t(const unsigned int rhs) { limbo = rhs & GMP_NUMB_MASK; } + // mp_limb_t(const mp_limb_t& rhs) { limbo = rhs.limbo; } // Causes havoc + BINOP_MASK(+, mp_limb_t) + BINOP_MASK(-, mp_limb_t) + BINOP_MASK(*, mp_limb_t) + BINOP_NOMASK(/, mp_limb_t) + BINOP_NOMASK(%, mp_limb_t) + BINOP_NOMASK(&, mp_limb_t) + BINOP_NOMASK(|, mp_limb_t) + BINOP_NOMASK(^, mp_limb_t) + mp_limb_t& operator<<=(const unsigned int rhs) { + GMP_ASSERT_ALWAYS (rhs < GMP_LIMB_BITS); + limbo = (limbo << rhs) & GMP_NUMB_MASK; + return *this; + } + mp_limb_t& operator>>=(const unsigned int rhs) { + GMP_ASSERT_ALWAYS (rhs < GMP_LIMB_BITS); + limbo = limbo >> rhs; + return *this; + } + mp_limb_t operator-() { + return static_cast((-limbo) & GMP_NUMB_MASK); + // mp_limb_t x; x.limbo = (-limbo) & GMP_NUMB_MASK; return x; + } + mp_limb_t operator~() { + return static_cast((~limbo) & GMP_NUMB_MASK); + // mp_limb_t x; x.limbo = (~limbo) & GMP_NUMB_MASK; return x; + } + operator unsigned int() const { return limbo; } + operator int() const { return limbo; } + +#define RELOP(op) \ + inline bool operator op(const mp_limb_t rhs) { \ + return 
limbo op rhs.limbo; \ + } + RELOP(==) + RELOP(!=) + RELOP(<) + RELOP(>) + RELOP(<=) + RELOP(>=) + +private: + mtype limbo; +}; + +#define BINOP2(op, type) \ + inline mp_limb_t operator op(mp_limb_t lhs, const type& rhs) { \ + lhs op##= rhs; \ + return lhs; \ + } + +BINOP2(+, mp_limb_t) +BINOP2(-, mp_limb_t) +BINOP2(*, mp_limb_t) +BINOP2(/, mp_limb_t) +BINOP2(%, mp_limb_t) +BINOP2(&, mp_limb_t) +BINOP2(|, mp_limb_t) +BINOP2(^, mp_limb_t) +BINOP2(<<, unsigned int) +BINOP2(>>, unsigned int) diff --git a/gcc/gmp/gmp.pc.in b/gcc/gmp/gmp.pc.in new file mode 100644 index 0000000..bf1c799 100644 --- /dev/null +++ b/gcc/gmp/gmp.pc.in @@ -1,0 +1,11 @@ +prefix=@prefix@ +exec_prefix=@exec_prefix@ +includedir=@includedir@ +libdir=@libdir@ + +Name: @PACKAGE_NAME@ +Description: GNU Multiple Precision Arithmetic Library +URL: https://gmplib.org +Version: @PACKAGE_VERSION@ +Cflags: -I${includedir} +Libs: -L${libdir} -lgmp diff --git a/gcc/gmp/gmpxx.pc.in b/gcc/gmp/gmpxx.pc.in new file mode 100644 index 0000000..181cc70 100644 --- /dev/null +++ b/gcc/gmp/gmpxx.pc.in @@ -1,0 +1,12 @@ +prefix=@prefix@ +exec_prefix=@exec_prefix@ +includedir=@includedir@ +libdir=@libdir@ + +Name: @PACKAGE_NAME@ C++ +Description: GNU Multiple Precision Arithmetic Library (C++ bindings) +URL: https://gmplib.org +Version: @PACKAGE_VERSION@ +Requires: gmp +Cflags: -I${includedir} +Libs: -L${libdir} -lgmpxx diff --git a/gcc/gmp/mini-gmp/mini-mpq.c b/gcc/gmp/mini-gmp/mini-mpq.c new file mode 100644 index 0000000..fd9b439 100644 --- /dev/null +++ b/gcc/gmp/mini-gmp/mini-mpq.c @@ -1,0 +1,554 @@ +/* mini-mpq, a minimalistic implementation of a GNU GMP subset. + + Contributed to the GNU project by Marco Bodrato + + Acknowledgment: special thanks to Bradley Lucier for his comments + to the preliminary version of this code. + +Copyright 2018, 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include +#include +#include +#include +#include + +#include "mini-mpq.h" + +#ifndef GMP_LIMB_HIGHBIT +/* Define macros and static functions already defined by mini-gmp.c */ +#define GMP_LIMB_BITS (sizeof(mp_limb_t) * CHAR_BIT) +#define GMP_LIMB_HIGHBIT ((mp_limb_t) 1 << (GMP_LIMB_BITS - 1)) +#define GMP_NEG_CAST(T,x) (-((T)((x) + 1) - 1)) +#define GMP_MIN(a, b) ((a) < (b) ? 
(a) : (b)) + +static mpz_srcptr +mpz_roinit_normal_n (mpz_t x, mp_srcptr xp, mp_size_t xs) +{ + x->_mp_alloc = 0; + x->_mp_d = (mp_ptr) xp; + x->_mp_size = xs; + return x; +} + +static void +gmp_die (const char *msg) +{ + fprintf (stderr, "%s\n", msg); + abort(); +} +#endif + + +/* MPQ helper functions */ +static mpq_srcptr +mpq_roinit_normal_nn (mpq_t x, mp_srcptr np, mp_size_t ns, + mp_srcptr dp, mp_size_t ds) +{ + mpz_roinit_normal_n (mpq_numref(x), np, ns); + mpz_roinit_normal_n (mpq_denref(x), dp, ds); + return x; +} + +static mpq_srcptr +mpq_roinit_zz (mpq_t x, mpz_srcptr n, mpz_srcptr d) +{ + return mpq_roinit_normal_nn (x, n->_mp_d, n->_mp_size, + d->_mp_d, d->_mp_size); +} + +static void +mpq_nan_init (mpq_t x) +{ + mpz_init (mpq_numref (x)); + mpz_init (mpq_denref (x)); +} + +void +mpq_init (mpq_t x) +{ + mpz_init (mpq_numref (x)); + mpz_init_set_ui (mpq_denref (x), 1); +} + +void +mpq_clear (mpq_t x) +{ + mpz_clear (mpq_numref (x)); + mpz_clear (mpq_denref (x)); +} + +static void +mpq_canonical_sign (mpq_t r) +{ + int cmp = mpq_denref (r)->_mp_size; + if (cmp <= 0) + { + if (cmp == 0) + gmp_die("mpq: Fraction with zero denominator."); + mpz_neg (mpq_denref (r), mpq_denref (r)); + mpz_neg (mpq_numref (r), mpq_numref (r)); + } +} + +static void +mpq_helper_canonicalize (mpq_t r, const mpz_t num, const mpz_t den, mpz_t g) +{ + if (num->_mp_size == 0) + mpq_set_ui (r, 0, 1); + else + { + mpz_gcd (g, num, den); + mpz_tdiv_q (mpq_numref (r), num, g); + mpz_tdiv_q (mpq_denref (r), den, g); + mpq_canonical_sign (r); + } +} + +void +mpq_canonicalize (mpq_t r) +{ + mpz_t t; + + mpz_init (t); + mpq_helper_canonicalize (r, mpq_numref (r), mpq_denref (r), t); + mpz_clear (t); +} + +void +mpq_swap (mpq_t a, mpq_t b) +{ + mpz_swap (mpq_numref (a), mpq_numref (b)); + mpz_swap (mpq_denref (a), mpq_denref (b)); +} + + +/* MPQ assignment and conversions. 
*/ +void +mpz_set_q (mpz_t r, const mpq_t q) +{ + mpz_tdiv_q (r, mpq_numref (q), mpq_denref (q)); +} + +void +mpq_set (mpq_t r, const mpq_t q) +{ + mpz_set (mpq_numref (r), mpq_numref (q)); + mpz_set (mpq_denref (r), mpq_denref (q)); +} + +void +mpq_set_ui (mpq_t r, unsigned long n, unsigned long d) +{ + mpz_set_ui (mpq_numref (r), n); + mpz_set_ui (mpq_denref (r), d); +} + +void +mpq_set_si (mpq_t r, signed long n, unsigned long d) +{ + mpz_set_si (mpq_numref (r), n); + mpz_set_ui (mpq_denref (r), d); +} + +void +mpq_set_z (mpq_t r, const mpz_t n) +{ + mpz_set_ui (mpq_denref (r), 1); + mpz_set (mpq_numref (r), n); +} + +void +mpq_set_num (mpq_t r, const mpz_t z) +{ + mpz_set (mpq_numref (r), z); +} + +void +mpq_set_den (mpq_t r, const mpz_t z) +{ + mpz_set (mpq_denref (r), z); +} + +void +mpq_get_num (mpz_t r, const mpq_t q) +{ + mpz_set (r, mpq_numref (q)); +} + +void +mpq_get_den (mpz_t r, const mpq_t q) +{ + mpz_set (r, mpq_denref (q)); +} + + +/* MPQ comparisons and the like. */ +int +mpq_cmp (const mpq_t a, const mpq_t b) +{ + mpz_t t1, t2; + int res; + + mpz_init (t1); + mpz_init (t2); + mpz_mul (t1, mpq_numref (a), mpq_denref (b)); + mpz_mul (t2, mpq_numref (b), mpq_denref (a)); + res = mpz_cmp (t1, t2); + mpz_clear (t1); + mpz_clear (t2); + + return res; +} + +int +mpq_cmp_z (const mpq_t a, const mpz_t b) +{ + mpz_t t; + int res; + + mpz_init (t); + mpz_mul (t, b, mpq_denref (a)); + res = mpz_cmp (mpq_numref (a), t); + mpz_clear (t); + + return res; +} + +int +mpq_equal (const mpq_t a, const mpq_t b) +{ + return (mpz_cmp (mpq_numref (a), mpq_numref (b)) == 0) && + (mpz_cmp (mpq_denref (a), mpq_denref (b)) == 0); +} + +int +mpq_cmp_ui (const mpq_t q, unsigned long n, unsigned long d) +{ + mpq_t t; + assert (d != 0); + if (ULONG_MAX <= GMP_LIMB_MAX) { + mp_limb_t nl = n, dl = d; + return mpq_cmp (q, mpq_roinit_normal_nn (t, &nl, n != 0, &dl, 1)); + } else { + int ret; + + mpq_init (t); + mpq_set_ui (t, n, d); + ret = mpq_cmp (q, t); + mpq_clear (t); + + 
return ret; + } +} + +int +mpq_cmp_si (const mpq_t q, signed long n, unsigned long d) +{ + assert (d != 0); + + if (n >= 0) + return mpq_cmp_ui (q, n, d); + else + { + mpq_t t; + + if (ULONG_MAX <= GMP_LIMB_MAX) + { + mp_limb_t nl = GMP_NEG_CAST (unsigned long, n), dl = d; + return mpq_cmp (q, mpq_roinit_normal_nn (t, &nl, -1, &dl, 1)); + } + else + { + unsigned long l_n = GMP_NEG_CAST (unsigned long, n); + + mpq_roinit_normal_nn (t, mpq_numref (q)->_mp_d, - mpq_numref (q)->_mp_size, + mpq_denref (q)->_mp_d, mpq_denref (q)->_mp_size); + return - mpq_cmp_ui (t, l_n, d); + } + } +} + +int +mpq_sgn (const mpq_t a) +{ + return mpz_sgn (mpq_numref (a)); +} + + +/* MPQ arithmetic. */ +void +mpq_abs (mpq_t r, const mpq_t q) +{ + mpz_abs (mpq_numref (r), mpq_numref (q)); + mpz_set (mpq_denref (r), mpq_denref (q)); +} + +void +mpq_neg (mpq_t r, const mpq_t q) +{ + mpz_neg (mpq_numref (r), mpq_numref (q)); + mpz_set (mpq_denref (r), mpq_denref (q)); +} + +void +mpq_add (mpq_t r, const mpq_t a, const mpq_t b) +{ + mpz_t t; + + mpz_init (t); + mpz_gcd (t, mpq_denref (a), mpq_denref (b)); + if (mpz_cmp_ui (t, 1) == 0) + { + mpz_mul (t, mpq_numref (a), mpq_denref (b)); + mpz_addmul (t, mpq_numref (b), mpq_denref (a)); + mpz_mul (mpq_denref (r), mpq_denref (a), mpq_denref (b)); + mpz_swap (mpq_numref (r), t); + } + else + { + mpz_t x, y; + mpz_init (x); + mpz_init (y); + + mpz_tdiv_q (x, mpq_denref (b), t); + mpz_tdiv_q (y, mpq_denref (a), t); + mpz_mul (x, mpq_numref (a), x); + mpz_addmul (x, mpq_numref (b), y); + + mpz_gcd (t, x, t); + mpz_tdiv_q (mpq_numref (r), x, t); + mpz_tdiv_q (x, mpq_denref (b), t); + mpz_mul (mpq_denref (r), x, y); + + mpz_clear (x); + mpz_clear (y); + } + mpz_clear (t); +} + +void +mpq_sub (mpq_t r, const mpq_t a, const mpq_t b) +{ + mpq_t t; + + mpq_roinit_normal_nn (t, mpq_numref (b)->_mp_d, - mpq_numref (b)->_mp_size, + mpq_denref (b)->_mp_d, mpq_denref (b)->_mp_size); + mpq_add (r, a, t); +} + +void +mpq_div (mpq_t r, const mpq_t a, const mpq_t b) 
+{ + mpq_t t; + mpq_mul (r, a, mpq_roinit_zz (t, mpq_denref (b), mpq_numref (b))); +} + +void +mpq_mul (mpq_t r, const mpq_t a, const mpq_t b) +{ + mpq_t t; + mpq_nan_init (t); + + if (a != b) { + mpz_t g; + + mpz_init (g); + mpq_helper_canonicalize (t, mpq_numref (a), mpq_denref (b), g); + mpq_helper_canonicalize (r, mpq_numref (b), mpq_denref (a), g); + mpz_clear (g); + + a = r; + b = t; + } + + mpz_mul (mpq_numref (r), mpq_numref (a), mpq_numref (b)); + mpz_mul (mpq_denref (r), mpq_denref (a), mpq_denref (b)); + mpq_clear (t); +} + +void +mpq_div_2exp (mpq_t r, const mpq_t q, mp_bitcnt_t e) +{ + mp_bitcnt_t z = mpz_scan1 (mpq_numref (q), 0); + z = GMP_MIN (z, e); + mpz_mul_2exp (mpq_denref (r), mpq_denref (q), e - z); + mpz_tdiv_q_2exp (mpq_numref (r), mpq_numref (q), z); +} + +void +mpq_mul_2exp (mpq_t r, const mpq_t q, mp_bitcnt_t e) +{ + mp_bitcnt_t z = mpz_scan1 (mpq_denref (q), 0); + z = GMP_MIN (z, e); + mpz_mul_2exp (mpq_numref (r), mpq_numref (q), e - z); + mpz_tdiv_q_2exp (mpq_denref (r), mpq_denref (q), z); +} + +void +mpq_inv (mpq_t r, const mpq_t q) +{ + mpq_set (r, q); + mpz_swap (mpq_denref (r), mpq_numref (r)); + mpq_canonical_sign (r); +} + + +/* MPQ to/from double. */ +void +mpq_set_d (mpq_t r, double x) +{ + mpz_set_ui (mpq_denref (r), 1); + + /* x != x is true when x is a NaN, and x == x * 0.5 is true when x is + zero or infinity. 
*/ + if (x == x * 0.5 || x != x) + mpq_numref (r)->_mp_size = 0; + else + { + double B; + mp_bitcnt_t e; + + B = 4.0 * (double) (GMP_LIMB_HIGHBIT >> 1); + for (e = 0; x != x + 0.5; e += GMP_LIMB_BITS) + x *= B; + + mpz_set_d (mpq_numref (r), x); + mpq_div_2exp (r, r, e); + } +} + +double +mpq_get_d (const mpq_t u) +{ + mp_bitcnt_t ne, de, ee; + mpz_t z; + double B, ret; + + ne = mpz_sizeinbase (mpq_numref (u), 2); + de = mpz_sizeinbase (mpq_denref (u), 2); + + ee = CHAR_BIT * sizeof (double); + if (de == 1 || ne > de + ee) + ee = 0; + else + ee = (ee + de - ne) / GMP_LIMB_BITS + 1; + + mpz_init (z); + mpz_mul_2exp (z, mpq_numref (u), ee * GMP_LIMB_BITS); + mpz_tdiv_q (z, z, mpq_denref (u)); + ret = mpz_get_d (z); + mpz_clear (z); + + B = 4.0 * (double) (GMP_LIMB_HIGHBIT >> 1); + for (B = 1 / B; ee != 0; --ee) + ret *= B; + + return ret; +} + + +/* MPQ and strings/streams. */ +char * +mpq_get_str (char *sp, int base, const mpq_t q) +{ + char *res; + char *rden; + size_t len; + + res = mpz_get_str (sp, base, mpq_numref (q)); + if (res == NULL || mpz_cmp_ui (mpq_denref (q), 1) == 0) + return res; + + len = strlen (res) + 1; + rden = sp ? 
sp + len : NULL; + rden = mpz_get_str (rden, base, mpq_denref (q)); + assert (rden != NULL); + + if (sp == NULL) { + void * (*gmp_reallocate_func) (void *, size_t, size_t); + void (*gmp_free_func) (void *, size_t); + size_t lden; + + mp_get_memory_functions (NULL, &gmp_reallocate_func, &gmp_free_func); + lden = strlen (rden) + 1; + res = (char *) gmp_reallocate_func (res, 0, (lden + len) * sizeof (char)); + memcpy (res + len, rden, lden); + gmp_free_func (rden, 0); + } + + res [len - 1] = '/'; + return res; +} + +/* Write x to stream as a string in the given base. Returns the number of + bytes written by fwrite, or 0 when mpq_get_str hands back NULL (it + forwards a NULL result of mpz_get_str, presumably an unsupported base). */ +size_t +mpq_out_str (FILE *stream, int base, const mpq_t x) +{ + char * str; + size_t len; + void (*gmp_free_func) (void *, size_t); + + str = mpq_get_str (NULL, base, x); + /* Nothing was allocated on failure; do not pass NULL to strlen/fwrite. */ + if (str == NULL) + return 0; + len = strlen (str); + len = fwrite (str, 1, len, stream); + mp_get_memory_functions (NULL, NULL, &gmp_free_func); + gmp_free_func (str, 0); + return len; +} + +int +mpq_set_str (mpq_t r, const char *sp, int base) +{ + const char *slash; + + slash = strchr (sp, '/'); + if (slash == NULL) { + mpz_set_ui (mpq_denref(r), 1); + return mpz_set_str (mpq_numref(r), sp, base); + } else { + char *num; + size_t numlen; + int ret; + void * (*gmp_allocate_func) (size_t); + void (*gmp_free_func) (void *, size_t); + + mp_get_memory_functions (&gmp_allocate_func, NULL, &gmp_free_func); + numlen = slash - sp; + num = (char *) gmp_allocate_func ((numlen + 1) * sizeof (char)); + memcpy (num, sp, numlen); + num[numlen] = '\0'; + ret = mpz_set_str (mpq_numref(r), num, base); + gmp_free_func (num, 0); + + if (ret != 0) + return ret; + + return mpz_set_str (mpq_denref(r), slash + 1, base); + } +} diff --git a/gcc/gmp/mini-gmp/mini-mpq.h b/gcc/gmp/mini-gmp/mini-mpq.h new file mode 100644 index 0000000..8eabcec 100644 --- /dev/null +++ b/gcc/gmp/mini-gmp/mini-mpq.h @@ -1,0 +1,114 @@ +/* mini-mpq, a minimalistic implementation of a GNU GMP subset. + +Copyright 2018, 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library.
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* Header */ + +#ifndef __MINI_MPQ_H__ +#define __MINI_MPQ_H__ + +#include "mini-gmp.h" + +#if defined (__cplusplus) +extern "C" { +#endif + +typedef struct +{ + __mpz_struct _mp_num; + __mpz_struct _mp_den; +} __mpq_struct; + +typedef __mpq_struct mpq_t[1]; + +typedef const __mpq_struct *mpq_srcptr; +typedef __mpq_struct *mpq_ptr; + +#define mpq_numref(Q) (&((Q)->_mp_num)) +#define mpq_denref(Q) (&((Q)->_mp_den)) + +void mpq_abs (mpq_t, const mpq_t); +void mpq_add (mpq_t, const mpq_t, const mpq_t); +void mpq_canonicalize (mpq_t); +void mpq_clear (mpq_t); +int mpq_cmp (const mpq_t, const mpq_t); +int mpq_cmp_si (const mpq_t, signed long, unsigned long); +int mpq_cmp_ui (const mpq_t, unsigned long, unsigned long); +int mpq_cmp_z (const mpq_t, const mpz_t); +void mpq_div (mpq_t, const mpq_t, const mpq_t); +void mpq_div_2exp (mpq_t, const mpq_t, mp_bitcnt_t); +int mpq_equal (const mpq_t, const mpq_t); +double mpq_get_d (const mpq_t); +void mpq_get_den (mpz_t, const mpq_t); +void mpq_get_num (mpz_t, const mpq_t); +char * mpq_get_str (char *, int, const mpq_t q); +void 
mpq_init (mpq_t); +void mpq_inv (mpq_t, const mpq_t); +void mpq_mul (mpq_t, const mpq_t, const mpq_t); +void mpq_mul_2exp (mpq_t, const mpq_t, mp_bitcnt_t); +void mpq_neg (mpq_t, const mpq_t); +void mpq_set (mpq_t, const mpq_t); +void mpq_set_d (mpq_t, double); +void mpq_set_den (mpq_t, const mpz_t); +void mpq_set_num (mpq_t, const mpz_t); +void mpq_set_si (mpq_t, signed long, unsigned long); +int mpq_set_str (mpq_t, const char *, int); +void mpq_set_ui (mpq_t, unsigned long, unsigned long); +void mpq_set_z (mpq_t, const mpz_t); +int mpq_sgn (const mpq_t); +void mpq_sub (mpq_t, const mpq_t, const mpq_t); +void mpq_swap (mpq_t, mpq_t); + +/* This long list taken from gmp.h. */ +/* For reference, "defined(EOF)" cannot be used here. In g++ 2.95.4, + <iostream> defines EOF but not FILE. */ +#if defined (FILE) \ + || defined (H_STDIO) \ + || defined (_H_STDIO) /* AIX */ \ + || defined (_STDIO_H) /* glibc, Sun, SCO */ \ + || defined (_STDIO_H_) /* BSD, OSF */ \ + || defined (__STDIO_H) /* Borland */ \ + || defined (__STDIO_H__) /* IRIX */ \ + || defined (_STDIO_INCLUDED) /* HPUX */ \ + || defined (__dj_include_stdio_h_) /* DJGPP */ \ + || defined (_FILE_DEFINED) /* Microsoft */ \ + || defined (__STDIO__) /* Apple MPW MrC */ \ + || defined (_MSL_STDIO_H) /* Metrowerks */ \ + || defined (_STDIO_H_INCLUDED) /* QNX4 */ \ + || defined (_ISO_STDIO_ISO_H) /* Sun C++ */ \ + || defined (__STDIO_LOADED) /* VMS */ +size_t mpq_out_str (FILE *, int, const mpq_t); +#endif + +void mpz_set_q (mpz_t, const mpq_t); + +#if defined (__cplusplus) +} +#endif +#endif /* __MINI_MPQ_H__ */ diff --git a/gcc/gmp/mpz/lucmod.c b/gcc/gmp/mpz/lucmod.c new file mode 100644 index 0000000..0dad48c 100644 --- /dev/null +++ b/gcc/gmp/mpz/lucmod.c @@ -1,0 +1,127 @@ +/* mpz_lucas_mod -- Helper function for the strong Lucas + primality test. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES.
+ +Copyright 2018 Free Software Foundation, Inc. + +Contributed by Marco Bodrato. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* Computes V_{k+1}, Q^{k+1} (mod n) for the Lucas' sequence */ +/* with P=1, Q=Q; k = n>>b0. */ +/* Requires n > 4; b0 > 0; -2*Q must not overflow a long. */ +/* If U_{k+1}==0 (mod n) or V_{k+1}==0 (mod n), it returns 1, */ +/* otherwise it returns 0 and sets V=V_{k+1} and Qk=Q^{k+1}. */ +/* V will never grow beyond SIZ(n), Qk not beyond 2*SIZ(n). */ +int +mpz_lucas_mod (mpz_ptr V, mpz_ptr Qk, long Q, + mp_bitcnt_t b0, mpz_srcptr n, mpz_ptr T1, mpz_ptr T2) +{ + mp_bitcnt_t bs; + int res; + + ASSERT (b0 > 0); + ASSERT (SIZ (n) > 1 || SIZ (n) > 0 && PTR (n) [0] > 4); + + mpz_set_ui (V, 1); /* U1 = 1 */ + bs = mpz_sizeinbase (n, 2) - 2; + if (UNLIKELY (bs < b0)) + { + /* n = 2^b0 - 1, should we use Lucas-Lehmer instead? 
*/ + ASSERT (bs == b0 - 2); + mpz_set_si (Qk, Q); + return 0; + } + mpz_set_ui (Qk, 1); /* U2 = 1 */ + + do + { + /* We use the iteration suggested in "Elementary Number Theory" */ + /* by Peter Hackman (November 1, 2009), section "L.XVII Scalar */ + /* Formulas", from http://hackmat.se/kurser/TATM54/booktot.pdf */ + /* U_{2k} = 2*U_{k+1}*U_k - P*U_k^2 */ + /* U_{2k+1} = U_{k+1}^2 - Q*U_k^2 */ + /* U_{2k+2} = P*U_{k+1}^2 - 2*Q*U_{k+1}*U_k */ + /* We note that U_{2k+2} = P*U_{2k+1} - Q*U_{2k} */ + /* The formulas are specialized for P=1, and only squares: */ + /* U_{2k} = U_{k+1}^2 - |U_{k+1} - U_k|^2 */ + /* U_{2k+1} = U_{k+1}^2 - Q*U_k^2 */ + /* U_{2k+2} = U_{2k+1} - Q*U_{2k} */ + mpz_mul (T1, Qk, Qk); /* U_{k+1}^2 */ + mpz_sub (Qk, V, Qk); /* |U_{k+1} - U_k| */ + mpz_mul (T2, Qk, Qk); /* |U_{k+1} - U_k|^2 */ + mpz_mul (Qk, V, V); /* U_k^2 */ + mpz_sub (T2, T1, T2); /* U_{k+1}^2 - (U_{k+1} - U_k)^2 */ + if (Q > 0) /* U_{k+1}^2 - Q U_k^2 = U_{2k+1} */ + mpz_submul_ui (T1, Qk, Q); + else + mpz_addmul_ui (T1, Qk, NEG_CAST (unsigned long, Q)); + + /* A step k->k+1 is performed if the bit in $n$ is 1 */ + if (mpz_tstbit (n, bs)) + { + /* U_{2k+2} = U_{2k+1} - Q*U_{2k} */ + mpz_mul_si (T2, T2, Q); + mpz_sub (T2, T1, T2); + mpz_swap (T1, T2); + } + mpz_tdiv_r (Qk, T1, n); + mpz_tdiv_r (V, T2, n); + } while (--bs >= b0); + + res = SIZ (Qk) == 0; + if (!res) { + mpz_mul_si (T1, V, -2*Q); + mpz_add (T1, Qk, T1); /* V_k = U_k - 2Q*U_{k-1} */ + mpz_tdiv_r (V, T1, n); + res = SIZ (V) == 0; + if (!res && b0 > 1) { + /* V_k and Q^k will be needed for further check, compute them. */ + /* FIXME: Here we compute V_k^2 and store V_k, but the former */ + /* will be recomputed by the calling function, should we store */ + /* that instead?
*/ + mpz_mul (T2, T1, T1); /* V_k^2 */ + mpz_mul (T1, Qk, Qk); /* P^2 U_k^2 = U_k^2 */ + mpz_sub (T2, T2, T1); + ASSERT (SIZ (T2) == 0 || PTR (T2) [0] % 4 == 0); + mpz_tdiv_q_2exp (T2, T2, 2); /* (V_k^2 - P^2 U_k^2) / 4 */ + if (Q > 0) /* (V_k^2 - (P^2 -4Q) U_k^2) / 4 = Q^k */ + mpz_addmul_ui (T2, T1, Q); + else + mpz_submul_ui (T2, T1, NEG_CAST (unsigned long, Q)); + mpz_tdiv_r (Qk, T2, n); + } + } + + return res; +} diff --git a/gcc/gmp/mpz/stronglucas.c b/gcc/gmp/mpz/stronglucas.c new file mode 100644 index 0000000..350dd2a 100644 --- /dev/null +++ b/gcc/gmp/mpz/stronglucas.c @@ -1,0 +1,178 @@ +/* mpz_stronglucas(n, t1, t2) -- An implementation of the strong Lucas + primality test on n, using parameters as suggested by the BPSW test. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. + +Copyright 2018 Free Software Foundation, Inc. + +Contributed by Marco Bodrato. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" +#include "longlong.h" + +/* Returns an approximation of the square root of x. + * It gives: + * limb_apprsqrt (x) ^ 2 <= x < (limb_apprsqrt (x)+1) ^ 2 + * or + * x <= limb_apprsqrt (x) ^ 2 <= x * 9/8 + */ +static mp_limb_t +limb_apprsqrt (mp_limb_t x) +{ + int s; + + ASSERT (x > 2); + count_leading_zeros (s, x); + s = (GMP_LIMB_BITS - s) >> 1; + return ((CNST_LIMB(1) << s) + (x >> s)) >> 1; +} + +/* Performs strong Lucas' test on x, with parameters suggested */ +/* for the BPSW test. Qk and V are passed to recycle variables. */ +/* Requires GCD (x,6) = 1.*/ +int +mpz_stronglucas (mpz_srcptr x, mpz_ptr V, mpz_ptr Qk) +{ + mp_bitcnt_t b0; + mpz_t n; + mp_limb_t D; /* The absolute value is stored. */ + long Q; + mpz_t T1, T2; + + /* Test on the absolute value. */ + mpz_roinit_n (n, PTR (x), ABSIZ (x)); + + ASSERT (mpz_odd_p (n)); + /* ASSERT (mpz_gcd_ui (NULL, n, 6) == 1); */ +#if GMP_NUMB_BITS % 16 == 0 + /* (2^12 - 1) | (2^{GMP_NUMB_BITS*3/4} - 1) */ + D = mpn_mod_34lsub1 (PTR (n), SIZ (n)); + /* (2^12 - 1) = 3^2 * 5 * 7 * 13 */ + ASSERT (D % 3 != 0 && D % 5 != 0 && D % 7 != 0); + if ((D % 5 & 2) != 0) + /* (5/n) = -1, iff n = 2 or 3 (mod 5) */ + /* D = 5; Q = -1 */ + return mpn_strongfibo (PTR (n), SIZ (n), PTR (V)); + else if (! POW2_P (D % 7)) + /* (-7/n) = -1, iff n = 3,5 or 6 (mod 7) */ + D = 7; /* Q = 2 */ + /* (9/n) = -1, never: 9 = 3^2 */ + else if (mpz_kronecker_ui (n, 11) == -1) + /* (-11/n) = (n/11) */ + D = 11; /* Q = 3 */ + else if ((((D % 13 - (D % 13 >> 3)) & 7) > 4) || + (((D % 13 - (D % 13 >> 3)) & 7) == 2)) + /* (13/n) = -1, iff n = 2,5,6,7,8 or 11 (mod 13) */ + D = 13; /* Q = -3 */ + else if (D % 3 == 2) + /* (-15/n) = (n/15) = (n/5)*(n/3) */ + /* Here, (n/5) = 1, and */ + /* (n/3) = -1, iff n = 2 (mod 3) */ + D = 15; /* Q = 4 */ +#if GMP_NUMB_BITS % 32 == 0 + /* (2^24 - 1) | (2^{GMP_NUMB_BITS*3/4} - 1) */ + /* (2^24 - 1) = (2^12 - 1) * 17 * 241 */ + else if (! POW2_P (D % 17) && !
POW2_P (17 - D % 17)) + D = 17; /* Q = -4 */ +#endif +#else + if (mpz_kronecker_ui (n, 5) == -1) + return mpn_strongfibo (PTR (n), SIZ (n), PTR (V)); +#endif + else + { + mp_limb_t tl; + mp_limb_t maxD; + int jac_bit1; + + if (UNLIKELY (mpz_perfect_square_p (n))) + return 0; /* A square is composite. */ + + /* Check Ds up to square root (in case, n is prime) + or avoid overflows */ + if (SIZ (n) == 1) + maxD = limb_apprsqrt (* PTR (n)); + else if (BITS_PER_ULONG >= GMP_NUMB_BITS && SIZ (n) == 2) + mpn_sqrtrem (&maxD, (mp_ptr) NULL, PTR (n), 2); + else + maxD = GMP_NUMB_MAX; + maxD = MIN (maxD, ULONG_MAX); + + D = GMP_NUMB_BITS % 16 == 0 ? (GMP_NUMB_BITS % 32 == 0 ? 17 : 15) : 5; + + /* Search a D such that (D/n) = -1 in the sequence 5,-7,9,-11,.. */ + /* For those Ds we have (D/n) = (n/|D|) */ + /* FIXME: Should we loop only on prime Ds? */ + /* The only interesting composite D is 15. */ + do + { + if (UNLIKELY (D >= maxD)) + return 1; + D += 2; + jac_bit1 = 0; + JACOBI_MOD_OR_MODEXACT_1_ODD (jac_bit1, tl, PTR (n), SIZ (n), D); + if (UNLIKELY (tl == 0)) + return 0; + } + while (mpn_jacobi_base (tl, D, jac_bit1) == 1); + } + + /* D= P^2 - 4Q; P = 1; Q = (1-D)/4 */ + Q = (D & 2) ? (D >> 2) + 1 : -(long) (D >> 2); + /* ASSERT (mpz_si_kronecker ((D & 2) ? 
NEG_CAST (long, D) : D, n) == -1); */ + + /* n-(D/n) = n+1 = d*2^{b0}, with d = (n>>b0) | 1 */ + b0 = mpz_scan0 (n, 0); + + mpz_init (T1); + mpz_init (T2); + + /* If Ud != 0 && Vd != 0 */ + if (mpz_lucas_mod (V, Qk, Q, b0, n, T1, T2) == 0) + if (LIKELY (--b0 != 0)) + do + { + /* V_{2k} <- V_k ^ 2 - 2Q^k */ + mpz_mul (T2, V, V); + mpz_submul_ui (T2, Qk, 2); + mpz_tdiv_r (V, T2, n); + if (SIZ (V) == 0 || UNLIKELY (--b0 == 0)) + break; + /* Q^{2k} = (Q^k)^2 */ + mpz_mul (T2, Qk, Qk); + mpz_tdiv_r (Qk, T2, n); + } while (1); + + mpz_clear (T1); + mpz_clear (T2); + + return (b0 != 0); +} diff --git a/gcc/gmp/tune/hgcd2-1.c b/gcc/gmp/tune/hgcd2-1.c new file mode 100644 index 0000000..1e8948c 100644 --- /dev/null +++ b/gcc/gmp/tune/hgcd2-1.c @@ -1,0 +1,39 @@ +/* mpn/generic/hgcd2.c method 1. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" + +#undef HGCD2_DIV1_METHOD +#define HGCD2_DIV1_METHOD 1 +#define __gmpn_hgcd2 mpn_hgcd2_1 +/* Not used, but renamed to not get duplicate definitions */ +#define __gmpn_hgcd_mul_matrix1_vector mpn_hgcd_mul_matrix1_vector_1 + +#include "mpn/generic/hgcd2.c" diff --git a/gcc/gmp/tune/hgcd2-2.c b/gcc/gmp/tune/hgcd2-2.c new file mode 100644 index 0000000..bbb123b 100644 --- /dev/null +++ b/gcc/gmp/tune/hgcd2-2.c @@ -1,0 +1,39 @@ +/* mpn/generic/hgcd2.c method 2. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#undef HGCD2_DIV1_METHOD +#define HGCD2_DIV1_METHOD 2 +#define __gmpn_hgcd2 mpn_hgcd2_2 +/* Not used, but renamed to not get duplicate definitions */ +#define __gmpn_hgcd_mul_matrix1_vector mpn_hgcd_mul_matrix1_vector_2 + +#include "mpn/generic/hgcd2.c" diff --git a/gcc/gmp/tune/hgcd2-3.c b/gcc/gmp/tune/hgcd2-3.c new file mode 100644 index 0000000..ac62108 100644 --- /dev/null +++ b/gcc/gmp/tune/hgcd2-3.c @@ -1,0 +1,39 @@ +/* mpn/generic/hgcd2.c method 3. 
+ +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#undef HGCD2_DIV1_METHOD +#define HGCD2_DIV1_METHOD 3 +#define __gmpn_hgcd2 mpn_hgcd2_3 +/* Not used, but renamed to not get duplicate definitions */ +#define __gmpn_hgcd_mul_matrix1_vector mpn_hgcd_mul_matrix1_vector_3 + +#include "mpn/generic/hgcd2.c" diff --git a/gcc/gmp/tune/hgcd2-4.c b/gcc/gmp/tune/hgcd2-4.c new file mode 100644 index 0000000..ec7f927 100644 --- /dev/null +++ b/gcc/gmp/tune/hgcd2-4.c @@ -1,0 +1,39 @@ +/* mpn/generic/hgcd2.c method 4. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#undef HGCD2_DIV1_METHOD +#define HGCD2_DIV1_METHOD 4 +#define __gmpn_hgcd2 mpn_hgcd2_4 +/* Not used, but renamed to not get duplicate definitions */ +#define __gmpn_hgcd_mul_matrix1_vector mpn_hgcd_mul_matrix1_vector_4 + +#include "mpn/generic/hgcd2.c" diff --git a/gcc/gmp/tune/hgcd2-5.c b/gcc/gmp/tune/hgcd2-5.c new file mode 100644 index 0000000..ed66171 100644 --- /dev/null +++ b/gcc/gmp/tune/hgcd2-5.c @@ -1,0 +1,39 @@ +/* mpn/generic/hgcd2.c method 5. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#undef HGCD2_DIV1_METHOD +#define HGCD2_DIV1_METHOD 5 +#define __gmpn_hgcd2 mpn_hgcd2_5 +/* Not used, but renamed to not get duplicate definitions */ +#define __gmpn_hgcd_mul_matrix1_vector mpn_hgcd_mul_matrix1_vector_5 + +#include "mpn/generic/hgcd2.c" diff --git a/gcc/gmp/tune/hgcd2.c b/gcc/gmp/tune/hgcd2.c new file mode 100644 index 0000000..146af72 100644 --- /dev/null +++ b/gcc/gmp/tune/hgcd2.c @@ -1,0 +1,49 @@ +/* mpn/generic/hgcd2.c for tuning + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define TUNE_PROGRAM_BUILD 1 + +#include "gmp-impl.h" + +hgcd2_func_t mpn_hgcd2_default; + +hgcd2_func_t *hgcd2_func = &mpn_hgcd2_default; + +int +mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl, + struct hgcd_matrix1 *M) +{ + return hgcd2_func(ah, al, bh, bl, M); +} + +#undef mpn_hgcd2 +#define mpn_hgcd2 mpn_hgcd2_default + +#include "mpn/generic/hgcd2.c" diff --git a/gcc/gmp/mini-gmp/tests/t-lucm.c b/gcc/gmp/mini-gmp/tests/t-lucm.c new file mode 100644 index 0000000..22ad575 100644 --- /dev/null +++ b/gcc/gmp/mini-gmp/tests/t-lucm.c @@ -1,0 +1,98 @@ +/* Tests the (internal) function mpz_lucas_mod + +Copyright 2018, Free Software Foundation, Inc. + +This file is part of the GNU MP Library test suite. + +The GNU MP Library test suite is free software; you can redistribute it +and/or modify it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 3 of the License, +or (at your option) any later version. + +The GNU MP Library test suite is distributed in the hope that it will be +useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. + +You should have received a copy of the GNU General Public License along with +the GNU MP Library test suite. If not, see https://www.gnu.org/licenses/. 
*/ + +#include +#include +#include +#include + +#include "testutils.h" + +#define MAXBITS 100 +#define COUNT 1000 + +void +testmain (int argc, char **argv) +{ + unsigned i; + mpz_t m, vr, qr, vm, qm, vt; + int resm, resr; + long Q; + unsigned long b0; + + mpz_init (m); + mpz_init (vr); + mpz_init (qr); + mpz_init (vm); + mpz_init (qm); + mpz_init (vt); + + for (i = 0; i < COUNT; i++) + { + mini_random_lucm_op (MAXBITS, vr, qr, m, &Q, &b0, &resr); + if (b0 == 0) + { + fprintf (stderr, "lucas_mod: test disabled (%u tests done).\n", i); + break; + } + resm = mpz_lucas_mod (vm, qm, Q, b0, m); + + if (resr != resm) + { + if (resm != 0 || mpz_cmp_ui (vm, 0) != 0) + { + fprintf (stderr, "mpz_lucas_mod wrong return value (%d != %d):\n", resr, resm); + fprintf (stderr, "Q = %ld , b0 = %lu\n", Q, b0); + dump ("m", m); + dump ("vm", vm); + dump ("qm", qm); + abort (); + } + } + else if (resm == 0) + { + mpz_abs (vr, vr); + mpz_sub (vt, m, vr); + mpz_abs (vm, vm); + mpz_mod (qm, qm, m); + if (mpz_cmp_ui (qr, 0) < 0) + mpz_add (qr, qr, m); + if (mpz_cmp (qm, qr) != 0 || + (mpz_cmp (vm, vr) != 0 && mpz_cmp (vm, vt) != 0)) + { + fprintf (stderr, "mpz_lucas_mod error:\n"); + fprintf (stderr, "Q = %ld , b0 = %lu\n", Q, b0); + dump ("m", m); + dump ("vm", vm); + dump ("vr", vr); + dump ("vt", vt); + dump ("qm", qm); + dump ("qr", qr); + abort (); + } + + } + } + mpz_clear (m); + mpz_clear (vr); + mpz_clear (qr); + mpz_clear (vm); + mpz_clear (qm); + mpz_clear (vt); +} diff --git a/gcc/gmp/mini-gmp/tests/t-mpq_addsub.c b/gcc/gmp/mini-gmp/tests/t-mpq_addsub.c new file mode 100644 index 0000000..de1461f 100644 --- /dev/null +++ b/gcc/gmp/mini-gmp/tests/t-mpq_addsub.c @@ -1,0 +1,204 @@ +/* + +Copyright 2012, 2013, 2018 Free Software Foundation, Inc. + +This file is part of the GNU MP Library test suite. 
+ +The GNU MP Library test suite is free software; you can redistribute it +and/or modify it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 3 of the License, +or (at your option) any later version. + +The GNU MP Library test suite is distributed in the hope that it will be +useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. + +You should have received a copy of the GNU General Public License along with +the GNU MP Library test suite. If not, see https://www.gnu.org/licenses/. */ + +#include +#include +#include + +#include "testutils.h" +#include "../mini-mpq.h" + +#define MAXBITS 300 +#define COUNT 10000 + +static void +_mpq_set_zz (mpq_t q, mpz_t n, mpz_t d) +{ + if (mpz_fits_ulong_p (d) && mpz_fits_slong_p (n)) + { + mpq_set_si (q, mpz_get_si (n), mpz_get_ui (d)); + } + else if (mpz_fits_ulong_p (d) && mpz_fits_ulong_p (n)) + { + mpq_set_ui (q, mpz_get_ui (n), mpz_get_ui (d)); + } + else + { + mpq_set_num (q, n); + mpq_set_den (q, d); + } + mpq_canonicalize (q); +} + +void +testcmpui () +{ + unsigned d1, d2, n1, n2; + mpq_t q1, q2; + + mpq_init (q1); + mpq_init (q2); + + for (d1 = 1; d1 < 6; d1 += 2) + for (n1 = 1; n1 < 6; n1 *= 2) + { + mpq_set_ui (q1, n1, d1); + for (d2 = 1; d2 < 6; d2 += 2) + for (n2 = 1; n2 < 6; n2 *= 2) + { + int fres = mpq_cmp_ui (q1, n2, d2); + int ref = (d1*n2 < d2*n1) - (d1*n2 > d2*n1); + + mpq_set_ui (q2, n2, d2); + + if ((!ref) != mpq_equal (q1, q2)) + { + fprintf (stderr, "mpz_equal failed: %i / %i = %i / %i ? %i\n", n1, d1, n2, d2, ref); + abort (); + } + + if (ref != fres) + { + fprintf (stderr, "mpz_cmp_ui failed: %i / %i = %i / %i ? 
%i != %i\n", n1, d1, n2, d2, ref, fres); + abort (); + } + } + } + + mpq_clear (q1); + mpq_clear (q2); +} + +void +testmain (int argc, char **argv) +{ + unsigned i; + mpz_t a, b, q, r, c; + mpq_t rr, ii, ff; + int tst; + + testcmpui (); + mpz_init (a); + mpz_init (b); + mpz_init (r); + mpz_init (q); + mpz_init (c); + mpq_init (rr); + mpq_init (ff); + mpq_init (ii); + + for (i = 0; i < COUNT; i++) + { + mini_random_op4 (OP_TDIV, MAXBITS, a, b, q, r); + + _mpq_set_zz (rr, a, b); + _mpq_set_zz (ff, r, b); + + mpq_set_z (ii, q); + + mpz_set_q (c, rr); + if (mpz_cmp (c, q)) + { + fprintf (stderr, "mpz_set_q failed:\n"); + dump ("a", a); + dump ("b", b); + dump ("c", c); + dump ("q", q); + abort (); + } + + if ((mpz_sgn (r) != 0) ^ (mpz_cmp_ui (mpq_denref (rr), 1) != 0)) + { + fprintf (stderr, "mpq_canonicalize failed:\n"); + dump ("a", a); + dump ("b", b); + dump ("r", r); + dump ("D", mpq_denref (rr)); + abort (); + } + + if (i & 1) + { + if (mpz_fits_slong_p (q)) + tst = mpq_cmp_si (rr, mpz_get_si (q), 1); + else if (mpz_fits_ulong_p (q)) + tst = mpq_cmp_ui (rr, mpz_get_ui (q), 1); + else + tst = mpq_cmp_z (rr, q); + if (mpz_sgn (b) < 0) + tst = - tst; + if ((tst != mpz_sgn (r)) && ((tst < 0 && mpz_sgn (r) >= 0) || (tst > 0 && mpz_sgn (r) <= 0))) + { + fprintf (stderr, "mpq_cmp ii failed: %i %i\n", tst, mpz_sgn (r)); + dump ("a", a); + dump ("b", b); + dump ("r", r); + dump ("q", q); + abort (); + } + } + else + { + if (mpz_fits_ulong_p (b) && mpz_fits_slong_p (r)) + tst = mpq_cmp_si (rr, mpz_get_si (r), mpz_get_ui (b)); + else if (mpz_fits_ulong_p (b) && mpz_fits_ulong_p (r)) + tst = mpq_cmp_ui (rr, mpz_get_ui (r), mpz_get_ui (b)); + else + tst = mpq_cmp (rr, ff); + if ((tst != mpz_sgn (q)) && ((tst < 0 && mpz_sgn (q) >= 0) || (tst > 0 && mpz_sgn (q) <= 0))) + { + fprintf (stderr, "mpq_cmp ff failed: %i %i\n", tst, mpz_sgn (q)); + dump ("a", a); + dump ("b", b); + dump ("r", r); + dump ("q", q); + abort (); + } + } + + if (i & 1) + { + mpq_sub (rr, rr, ff); + } + 
else + { + mpq_neg (ff, ff); + mpq_add (rr, ff, rr); + } + + if (!mpq_equal (ii, rr)) + { + fprintf (stderr, "mpq_%s failed:\n", (i & 1) ? "sub" : "add"); + dump ("a", a); + dump ("b", b); + dump ("r", r); + dump ("q", q); + abort (); + } + } + + mpz_clear (a); + mpz_clear (b); + mpz_clear (r); + mpz_clear (q); + mpz_clear (c); + mpq_clear (rr); + mpq_clear (ff); + mpq_clear (ii); +} diff --git a/gcc/gmp/mini-gmp/tests/t-mpq_double.c b/gcc/gmp/mini-gmp/tests/t-mpq_double.c new file mode 100644 index 0000000..1bd6c92 100644 --- /dev/null +++ b/gcc/gmp/mini-gmp/tests/t-mpq_double.c @@ -1,0 +1,214 @@ +/* Test mpq_set_d. + +Copyright 2001-2003, 2005, 2013, 2018 Free Software Foundation, Inc. + +This file is part of the GNU MP Library test suite. + +The GNU MP Library test suite is free software; you can redistribute it +and/or modify it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 3 of the License, +or (at your option) any later version. + +The GNU MP Library test suite is distributed in the hope that it will be +useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. + +You should have received a copy of the GNU General Public License along with +the GNU MP Library test suite. If not, see https://www.gnu.org/licenses/. */ + +#include +#include +#include + +#include "testutils.h" +#include "../mini-mpq.h" + +#define COUNT 2000 + +mp_bitcnt_t +mpz_mantissasizeinbits (const mpz_t z) +{ + return ! mpz_cmp_ui (z, 0) ? 0 : + mpz_sizeinbase (z, 2) - mpz_scan1 (z, 0); +} + +int +mpz_abspow2_p (const mpz_t z) +{ + return mpz_mantissasizeinbits (z) == 1; +} + +mp_bitcnt_t +mpq_mantissasizeinbits (const mpq_t q) +{ + if (! 
mpz_abspow2_p (mpq_denref (q))) + return ~ (mp_bitcnt_t) 0; + + return mpz_mantissasizeinbits (mpq_numref (q)); +} + +#if defined(DBL_MANT_DIG) && FLT_RADIX == 2 +int +mpz_get_d_exact_p (const mpz_t z) +{ + return mpz_mantissasizeinbits (z) <= DBL_MANT_DIG; +} + +int +mpq_get_d_exact_p (const mpq_t q) +{ + /* return mpq_mantissasizeinbits (q) <= DBL_MANT_DIG; */ + return + (mpz_sizeinbase (mpq_denref (q), 2) - + mpz_scan1 (mpq_denref (q), 0) == 1) && + (mpz_sizeinbase (mpq_numref (q), 2) - + mpz_scan1 (mpq_numref (q), 0) <= DBL_MANT_DIG); + /* mpz_sizeinbase (zero, 2) - mpz_scan1 (zero, 0) == 2 */ +} +#define HAVE_EXACT_P 1 +#endif + +void +check_random (void) +{ + unsigned i; + mpz_t x; + mpq_t y, z; + + mpz_init (x); + mpq_init (y); + mpq_init (z); + + for (i = 0; i < COUNT; i++) + { + /* Use volatile, to avoid extended precision in floating point + registers, e.g., on m68k and 80387. */ + volatile double d, f; + unsigned long m; + int e, c; + + mini_rrandomb (x, CHAR_BIT * sizeof (unsigned long)); + m = mpz_get_ui (x); + mini_urandomb (x, 8); + e = mpz_get_ui (x) - 128; + + d = ldexp ((double) m, e); + mpq_set_d (y, d); + f = mpq_get_d (y); + if (f != d) + { + fprintf (stderr, "mpq_set_d/mpq_get_d failed:\n"); + goto dumperror; + } + + d = - d; + mpq_neg (y, y); + + mpq_set_d (z, d); + f = mpq_get_d (z); + if (f != d || !mpq_equal (y, z)) + { + fprintf (stderr, "mpq_set_d/mpq_get_d failed:\n"); + dumperror: + dump ("ny", mpq_numref (y)); + dump ("dy", mpq_denref (y)); + fprintf (stderr, "m = %lx, e = %i\n", m, e); + fprintf (stderr, "d = %.35g\n", d); + fprintf (stderr, "f = %.35g\n", f); + fprintf (stderr, "f - d = %.35g\n", f - d); + abort (); + } + + mini_rrandomb (x, CHAR_BIT * sizeof (unsigned long)); + m = mpz_get_ui (x); + mini_urandomb (x, 8); + e = mpz_get_ui (x) - 128; + + d = ldexp ((double) m, e); + mpq_set_d (y, d); + + mpq_add (y, y, z); + mpq_set_d (z, mpq_get_d (y)); + f = mpq_get_d (z); + c = mpq_cmp (y, z); + +#if defined(HAVE_EXACT_P) + if 
(mpq_get_d_exact_p (y) ? c != 0 : (f > 0 ? c <= 0 : c >= 0)) +#else + if (f > 0 ? c < 0 : c > 0) +#endif + { + fprintf (stderr, "mpq_get_d/mpq_set_d failed: %i %i\n", i, c); + goto dumperror; + } + } + + mpz_clear (x); + mpq_clear (y); + mpq_clear (z); +} + + +void +check_data (void) +{ + static const struct { + double y; + long int n; + unsigned long d; + } data[] = { + { 0.0, 0, 1 }, + { 1.0, 1, 1 }, + { -1.0, -1, 1 }, + { -1.5, -3, 2 }, + {-1.25, -5, 4 }, + {0.125, 1, 8 }, + + {24685,24685,1}, + {-9876,-9876,1}, + {463.5, 927,2}, + + {1234.5/8192, 2469, 16384 }, + {-543.0/1024, -543, 1024 }, + {9876.5/ 512, 19753, 1024 }, + {9753.0/ 128, 9753, 128 }, + {-789.0/ 32, -789, 32 }, + {4.580078125, 2345, 512 }, + }; + + mpq_t x, r; + unsigned i; + double d; + + mpq_init (x); + mpq_init (r); + + for (i = 0; i < numberof (data); i++) + { + mpq_set_d (x, data[i].y); + mpq_set_si (r, data[i].n, data[i].d); + mpq_canonicalize (r); + if (!mpq_equal (x, r)) + { + fprintf (stderr, "mpq_set_d failed: %li / %lu != %g\n", data[i].n, data[i].d, data[i].y); + abort (); + } + d = mpq_get_d (r); + if (d != data[i].y) + { + fprintf (stderr, "mpq_get_d failed: %li / %lu != %g\n", data[i].n, data[i].d, data[i].y); + abort (); + } + } + + mpq_clear (x); + mpq_clear (r); +} + +void +testmain (int argc, char *argv[]) +{ + check_data (); + check_random (); +} diff --git a/gcc/gmp/mini-gmp/tests/t-mpq_muldiv.c b/gcc/gmp/mini-gmp/tests/t-mpq_muldiv.c new file mode 100644 index 0000000..8e7de8b 100644 --- /dev/null +++ b/gcc/gmp/mini-gmp/tests/t-mpq_muldiv.c @@ -1,0 +1,176 @@ +/* + +Copyright 2012, 2013, 2018 Free Software Foundation, Inc. + +This file is part of the GNU MP Library test suite. + +The GNU MP Library test suite is free software; you can redistribute it +and/or modify it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 3 of the License, +or (at your option) any later version. 
+ +The GNU MP Library test suite is distributed in the hope that it will be +useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. + +You should have received a copy of the GNU General Public License along with +the GNU MP Library test suite. If not, see https://www.gnu.org/licenses/. */ + +#include +#include +#include + +#include "testutils.h" +#include "../mini-mpq.h" + +#define MAXBITS 300 +#define COUNT 10000 + +static void +_mpq_set_zz (mpq_t q, mpz_t n, mpz_t d) +{ + if (mpz_fits_ulong_p (d) && mpz_fits_slong_p (n)) + { + mpq_set_si (q, mpz_get_si (n), mpz_get_ui (d)); + } + else if (mpz_fits_ulong_p (d) && mpz_fits_ulong_p (n)) + { + mpq_set_ui (q, mpz_get_ui (n), mpz_get_ui (d)); + } + else + { + mpq_set_num (q, n); + mpq_set_den (q, d); + } + mpq_canonicalize (q); +} + +void +testmain (int argc, char **argv) +{ + unsigned i; + mpz_t an, bn, rn, ad, bd, rd; + mpq_t aq, bq, refq, resq; + + mpz_init (an); + mpz_init (bn); + mpz_init (rn); + mpz_init (ad); + mpz_init (bd); + mpz_init (rd); + mpq_init (aq); + mpq_init (bq); + mpq_init (refq); + mpq_init (resq); + + for (i = 0; i < COUNT; i++) + { + mini_random_op3 (OP_MUL, MAXBITS, an, bn, rn); + do { + mini_random_op3 (OP_MUL, MAXBITS, ad, bd, rd); + } while (mpz_sgn (rd) == 0); + + _mpq_set_zz (aq, an, ad); + _mpq_set_zz (bq, bn, bd); + _mpq_set_zz (refq, rn, rd); + + mpq_mul (resq, aq, bq); + if (!mpq_equal (resq, refq)) + { + fprintf (stderr, "mpq_mul failed [%i]:\n", i); + dump ("an", an); + dump ("ad", ad); + dump ("bn", bn); + dump ("bd", bd); + dump ("refn", rn); + dump ("refd", rd); + dump ("resn", mpq_numref (resq)); + dump ("resd", mpq_denref (resq)); + abort (); + } + + if (mpq_sgn (refq) != 0) + { + mpq_set_ui (resq, ~6, 8); + mpq_inv (aq, aq); + mpq_div (resq, aq, bq); + mpq_inv (resq, resq); + if (!mpq_equal (resq, refq)) + { + fprintf (stderr, "mpq_div failed [%i]:\n", i); 
+ dump ("an", an); + dump ("ad", ad); + dump ("bn", bn); + dump ("bd", bd); + dump ("refn", rn); + dump ("refd", rd); + dump ("resn", mpq_numref (resq)); + dump ("resd", mpq_denref (resq)); + abort (); + } + + mpq_swap (bq, aq); + mpq_div (resq, aq, bq); + if (!mpq_equal (resq, refq)) + { + fprintf (stderr, "mpq_swap failed [%i]:\n", i); + dump ("an", an); + dump ("ad", ad); + dump ("bn", bn); + dump ("bd", bd); + dump ("refn", rn); + dump ("refd", rd); + dump ("resn", mpq_numref (resq)); + dump ("resd", mpq_denref (resq)); + abort (); + } + } + + mpq_set (resq, aq); + mpq_neg (bq, aq); + mpq_abs (refq, aq); + if (mpq_equal (refq, resq)) + mpq_add (resq, refq, bq); + else + mpq_add (resq, refq, resq); + mpq_set_ui (refq, 0, 1); + if (!mpq_equal (resq, refq)) + { + fprintf (stderr, "mpq_abs failed [%i]:\n", i); + dump ("an", an); + dump ("ad", ad); + dump ("resn", mpq_numref (resq)); + dump ("resd", mpq_denref (resq)); + abort (); + } + + mpq_mul (resq, aq, aq); + mpq_mul (refq, aq, bq); /* now bq = - aq */ + mpq_neg (refq, refq); + if (!mpq_equal (resq, refq)) + { + fprintf (stderr, "mpq_mul(sqr) failed [%i]:\n", i); + dump ("an", an); + dump ("ad", ad); + dump ("bn", bn); + dump ("bd", bd); + dump ("refn", rn); + dump ("refd", rd); + dump ("resn", mpq_numref (resq)); + dump ("resd", mpq_denref (resq)); + abort (); + } + } + + mpz_clear (an); + mpz_clear (bn); + mpz_clear (rn); + mpz_clear (ad); + mpz_clear (bd); + mpz_clear (rd); + mpq_clear (aq); + mpq_clear (bq); + mpq_clear (refq); + mpq_clear (resq); +} diff --git a/gcc/gmp/mini-gmp/tests/t-mpq_muldiv_2exp.c b/gcc/gmp/mini-gmp/tests/t-mpq_muldiv_2exp.c new file mode 100644 index 0000000..46b2c0c 100644 --- /dev/null +++ b/gcc/gmp/mini-gmp/tests/t-mpq_muldiv_2exp.c @@ -1,0 +1,138 @@ +/* + +Copyright 2012, 2013, 2018 Free Software Foundation, Inc. + +This file is part of the GNU MP Library test suite. 
+ +The GNU MP Library test suite is free software; you can redistribute it +and/or modify it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 3 of the License, +or (at your option) any later version. + +The GNU MP Library test suite is distributed in the hope that it will be +useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. + +You should have received a copy of the GNU General Public License along with +the GNU MP Library test suite. If not, see https://www.gnu.org/licenses/. */ + +#include +#include +#include + +#include "testutils.h" +#include "../mini-mpq.h" + +#define MAXBITS 300 +#define COUNT 10000 + +static void +_mpq_set_zz (mpq_t q, mpz_t n, mpz_t d) +{ + if (mpz_fits_ulong_p (d) && mpz_fits_slong_p (n)) + { + mpq_set_si (q, mpz_get_si (n), mpz_get_ui (d)); + } + else if (mpz_fits_ulong_p (d) && mpz_fits_ulong_p (n)) + { + mpq_set_ui (q, mpz_get_ui (n), mpz_get_ui (d)); + } + else + { + mpq_set_num (q, n); + mpq_set_den (q, d); + } + mpq_canonicalize (q); +} + +void +testmain (int argc, char **argv) +{ + unsigned i; + mpz_t a, b, t; + mpq_t aq, rq, tq; + mp_bitcnt_t e; + long int e2, t1, t2; + + mpz_init (a); + mpz_init (b); + mpz_init (t); + mpq_init (aq); + mpq_init (rq); + mpq_init (tq); + + for (i = 0; i < COUNT; i++) + { + do { + mini_random_bit_op (OP_COMBIT, MAXBITS, a, &e, b); + } while (mpz_sgn (a) == 0 || mpz_sgn (b) == 0); + + _mpq_set_zz (aq, a, b); + e2 = mpz_scan1 (a, 0); + e2-= mpz_scan1 (b, 0); + + mpq_mul_2exp (rq, aq, e); + t1 = mpz_scan1 (mpq_numref (rq), 0); + t2 = mpz_scan1 (mpq_denref (rq), 0); + mpq_neg (tq, rq); + mpq_div (tq, aq, tq); + mpq_get_den (t, tq); + + if (e2 + e != t1 - t2 || (t2 != 0 && t1 != 0) || mpz_scan1 (t, 0) != e + || mpz_sizeinbase (t, 2) - 1 != e || mpz_cmp_si (mpq_numref (tq), -1) != 0) + { + fprintf (stderr, 
"mpq_mul_2exp failed: %lu\n", e); + dump ("na", a); + dump ("da", b); + dump ("nr", mpq_numref (rq)); + dump ("dr", mpq_denref (rq)); + abort (); + } + + mpq_div_2exp (rq, aq, e); + t1 = mpz_scan1 (mpq_numref (rq), 0); + t2 = mpz_scan1 (mpq_denref (rq), 0); + mpq_div (aq, aq, rq); + mpq_get_num (t, aq); + + if (e2 != t1 - t2 + e || (t2 != 0 && t1 != 0) || mpz_scan1 (t, 0) != e + || mpz_sizeinbase (t, 2) - 1 != e || mpz_cmp_ui (mpq_denref (aq), 1) != 0) + { + fprintf (stderr, "mpq_div_2exp failed: %lu\n", e); + fprintf (stderr, "%li %li %lu %zu\n", e2, t2, mpz_scan1 (t, 0), mpz_sizeinbase (t, 2)); + dump ("na", a); + dump ("da", b); + dump ("nr", mpq_numref (rq)); + dump ("dr", mpq_denref (rq)); + abort (); + } + + mpq_set_ui (aq, 0, 1); + mpq_set_ui (rq, 6, 7); + mpq_set (tq, aq); + mpq_div_2exp (rq, aq, e); + + if (!mpq_equal (tq, rq)) + { + fprintf (stderr, "mpq_div_2exp failed on zero: %lu\n", e); + abort (); + } + + mpq_set_ui (rq, 7, 6); + mpq_mul_2exp (rq, aq, e); + + if (!mpq_equal (rq, tq)) + { + fprintf (stderr, "mpq_mul_2exp failed on zero: %lu\n", e); + abort (); + } + } + + mpz_clear (a); + mpz_clear (b); + mpz_clear (t); + mpq_clear (aq); + mpq_clear (rq); + mpq_clear (tq); +} diff --git a/gcc/gmp/mini-gmp/tests/t-mpq_str.c b/gcc/gmp/mini-gmp/tests/t-mpq_str.c new file mode 100644 index 0000000..7c69153 100644 --- /dev/null +++ b/gcc/gmp/mini-gmp/tests/t-mpq_str.c @@ -1,0 +1,252 @@ +/* + +Copyright 2012-2014, 2016, 2018 Free Software Foundation, Inc. + +This file is part of the GNU MP Library test suite. + +The GNU MP Library test suite is free software; you can redistribute it +and/or modify it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 3 of the License, +or (at your option) any later version. 
+ +The GNU MP Library test suite is distributed in the hope that it will be +useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. + +You should have received a copy of the GNU General Public License along with +the GNU MP Library test suite. If not, see https://www.gnu.org/licenses/. */ + +#include +#include +#include +#include +#include + +#include "testutils.h" +#include "../mini-mpq.h" + +#define MAXBITS 400 +#define COUNT 2000 + +#define GMP_LIMB_BITS (sizeof(mp_limb_t) * CHAR_BIT) +#define MAXLIMBS ((MAXBITS + GMP_LIMB_BITS - 1) / GMP_LIMB_BITS) + +static void +test_small (void) +{ + struct { + const char *input; + const char *decimal; + } data[] = { + { "1832407/3", "1832407/3" }, + { " 2763959/6", "2763959/6 " }, + { "4 981 999 / 1 8", "4981999/18" }, + { "10\t73981/30 ", "1073981/30" }, + { "958 544 /1", "00958544/01" }, + { "-0", "0000" }, + { " -000 ", "0/ 1" }, + { "0704436/011", "231710/9" }, + /* Check the case of large number of leading zeros. 
*/ + { "0000000000000000000000000/1", "0/0000000000000000000000001" }, + { "000000000000000704436/000011", "0000000000000000231710/00009" }, + { " 012/ 02503517", "10/689999" }, + { "0b 10/0 1312143", "2/365667" }, + { "-03 274062/0x1", "-882738/1" }, + { "012\t242", "005282" }, + { "9/0b11010111110010001111", "9/883855" }, + { "022/ 0b11001010010100001", "18/103585" }, + { "-0b101010110011101111/0x12", "-175343/18" }, + { "-05/0b 111 1111 0110 1110 0110", "-5/521958" }, + { "0b 011 111 110 111 001 000 011/0b00110", "1044035/6" }, + { " 0x53dfc", "343548" }, + { "-0x00012/0x000fA019", "-18/1024025" }, + { "0x 642d1", "410321" }, + { "0x5 8067/0Xa", "360551/10" }, + { "-0xd6Be6/3", "-879590/3" }, + { "\t0B1110000100000000011", "460803" }, + { "0B\t1111110010010100101", "517285" }, + { "-0x 00 2d/0B1\t010111101101110100", "-45/359284" }, + { "-0B101\t1001101111111001", "-367609" }, + { "0B10001001010111110000/0xf", "562672/15" }, + { "0Xe4B7e/1", "936830" }, + { "0X1E4bf/0X1", "124095" }, + { "-0Xfdb90/05", "-1039248/5" }, + { "0b010/0X7fc47", "2/523335" }, + { "15/0X8167c", "15/530044" }, + /* Some invalid inputs */ + { "", NULL }, + { "0x", NULL }, + { "0b", NULL }, + { "0z", NULL }, + { "-", NULL }, + { "/0x ", NULL }, + { "0|1", NULL }, + { "/", NULL }, + { "0ab", NULL }, + { "10x0", NULL }, + { "1/0xxab", NULL }, + { "0/ab", NULL }, + { "0/#", NULL }, + { "$foo/1", NULL }, + { NULL, NULL } + }; + unsigned i; + mpq_t a, b; + mpq_init (a); + mpq_init (b); + + for (i = 0; data[i].input; i++) + { + int res = mpq_set_str (a, data[i].input, 0); + if (data[i].decimal) + { + if (res != 0) + { + fprintf (stderr, "mpq_set_str returned -1, input: %s\n", + data[i].input); + abort (); + } + if (mpq_set_str (b, data[i].decimal, 10) != 0) + { + fprintf (stderr, "mpq_set_str returned -1, decimal input: %s\n", + data[i].input); + abort (); + } + if (!mpq_equal (a, b)) + { + fprintf (stderr, "mpq_set_str failed for input: %s\n", + data[i].input); + + dump ("got_num", mpq_numref 
(a));
+              dump ("got_den", mpq_denref (a));
+              dump ("ref_num", mpq_numref (b));
+              dump ("ref_den", mpq_denref (b));
+              abort ();
+            }
+        }
+      else if (res != -1)
+        {
+          fprintf (stderr, "mpq_set_str returned %d, invalid input: %s\n",
+                   res, data[i].input);
+          abort ();
+        }
+    }
+
+  mpq_clear (a);
+  mpq_clear (b);
+}
+/* Run test_small, then round-trip random rationals through mpq_set_str, mpq_get_str and mpq_out_str for bases 2..62; the base is passed negated when i is even and base <= 36. */
+void
+testmain (int argc, char **argv)
+{
+  unsigned i;
+  char *ap;
+  char *bp;
+  char *rp;
+  size_t rn, arn;
+
+  mpq_t a, b;
+
+  FILE *tmp;
+
+  test_small ();
+
+  mpq_init (a);
+  mpq_init (b);
+
+  tmp = tmpfile ();
+  if (!tmp)
+    fprintf (stderr,
+             "Failed to create temporary file. Skipping mpq_out_str tests.\n");
+
+  for (i = 0; i < COUNT/60; i++)
+    {
+      int base;
+      for (base = 2; base <= 62; ++base)
+        {
+          hex_mpq_random_str_op (MAXBITS, (i&1 || base > 36) ? base: -base, &ap, &rp);
+          if (mpq_set_str (a, ap, 16) != 0)
+            {
+              fprintf (stderr, "mpq_set_str failed on input %s\n", ap);
+              abort ();
+            }
+
+          rn = strlen (rp);
+          arn = rn - (rp[0] == '-');
+
+          bp = mpq_get_str (NULL, (i&1 || base > 36) ? base: -base, a);
+          if (strcmp (bp, rp))
+            {
+              fprintf (stderr, "mpq_get_str failed:\n"); /* the call under test is mpq_get_str, not mpz_get_str */
+              dump ("a_num", mpq_numref (a));
+              dump ("a_den", mpq_denref (a));
+              fprintf (stderr, "b = %s\n", bp);
+              fprintf (stderr, " base = %d\n", base);
+              fprintf (stderr, "r = %s\n", rp);
+              abort ();
+            }
+
+          /* Just a few tests with file i/o. */
+          if (tmp && i < 20)
+            {
+              size_t tn;
+              rewind (tmp);
+              tn = mpq_out_str (tmp, (i&1 || base > 36) ?
base: -base, a); + if (tn != rn) + { + fprintf (stderr, "mpq_out_str, bad return value:\n"); + dump ("a_num", mpq_numref (a)); + dump ("a_den", mpq_denref (a)); + fprintf (stderr, "r = %s\n", rp); + fprintf (stderr, " base %d, correct size %u, got %u\n", + base, (unsigned) rn, (unsigned)tn); + abort (); + } + rewind (tmp); + memset (bp, 0, rn); + tn = fread (bp, 1, rn, tmp); + if (tn != rn) + { + fprintf (stderr, + "fread failed, expected %lu bytes, got only %lu.\n", + (unsigned long) rn, (unsigned long) tn); + abort (); + } + + if (memcmp (bp, rp, rn) != 0) + { + fprintf (stderr, "mpq_out_str failed:\n"); + dump ("a_num", mpq_numref (a)); + dump ("a_den", mpq_denref (a)); + fprintf (stderr, "b = %s\n", bp); + fprintf (stderr, " base = %d\n", base); + fprintf (stderr, "r = %s\n", rp); + abort (); + } + } + + mpq_set_str (b, rp, base); + + if (!mpq_equal (a, b)) + { + fprintf (stderr, "mpq_set_str failed:\n"); + fprintf (stderr, "r = %s\n", rp); + fprintf (stderr, " base = %d\n", base); + fprintf (stderr, "r = %s\n", ap); + fprintf (stderr, " base = 16\n"); + dump ("b_num", mpq_numref (b)); + dump ("b_den", mpq_denref (b)); + dump ("r_num", mpq_numref (a)); + dump ("r_den", mpq_denref (a)); + abort (); + } + + free (ap); + free (rp); + testfree (bp); + } + } + mpq_clear (a); + mpq_clear (b); +} diff --git a/gcc/gmp/mpn/arm/bdiv_q_1.asm b/gcc/gmp/mpn/arm/bdiv_q_1.asm new file mode 100644 index 0000000..ae395d1 100644 --- /dev/null +++ b/gcc/gmp/mpn/arm/bdiv_q_1.asm @@ -1,0 +1,162 @@ +dnl ARM v4 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C norm unorm +C 1176 13 18 +C Cortex-A5 8 12 +C Cortex-A7 10.5 18 +C Cortex-A8 14 15 +C Cortex-A9 10 12 not measured since latest edits +C Cortex-A15 9 9 +C Cortex-A53 14 20 + +C Architecture requirements: +C v5 - +C v5t - +C v5te - +C v6 - +C v6t2 - +C v7a - + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') +define(`d', `r3') +define(`di_arg', `sp[0]') C just mpn_pi1_bdiv_q_1 +define(`cnt_arg', `sp[4]') C just mpn_pi1_bdiv_q_1 + +define(`cy', `r7') +define(`cnt', `r6') +define(`tnc', `r8') + +ASM_START() +PROLOGUE(mpn_bdiv_q_1) + tst d, #1 + push {r6-r11} + mov cnt, #0 + bne L(inv) + +C count trailing zeros + movs r10, d, lsl #16 + moveq d, d, lsr #16 + moveq cnt, #16 + tst d, #0xff + moveq d, d, lsr #8 + addeq cnt, cnt, #8 + LEA( r10, ctz_tab) + and r11, d, #0xff + ldrb r10, [r10, r11] + mov d, d, lsr r10 + add cnt, cnt, r10 + +C binvert limb +L(inv): LEA( r10, binvert_limb_table) + and r12, d, #254 + ldrb r10, [r10, r12, lsr #1] + mul 
r12, r10, r10 + mul r12, d, r12 + rsb r12, r12, r10, lsl #1 + mul r10, r12, r12 + mul r10, d, r10 + rsb r10, r10, r12, lsl #1 C r10 = inverse + b L(pi1) +EPILOGUE() + +PROLOGUE(mpn_pi1_bdiv_q_1) + push {r6-r11} + + ldr cnt, [sp, #28] + ldr r10, [sp, #24] + +L(pi1): ldr r11, [up], #4 C up[0] + cmp cnt, #0 + mov cy, #0 + bne L(unorm) + +L(norm): + subs n, n, #1 C set carry as side-effect + beq L(edn) + + ALIGN(16) +L(tpn): sbcs cy, r11, cy + ldr r11, [up], #4 + sub n, n, #1 + mul r9, r10, cy + tst n, n + umull r12, cy, d, r9 + str r9, [rp], #4 + bne L(tpn) + +L(edn): sbc cy, r11, cy + mul r9, r10, cy + str r9, [rp] + pop {r6-r11} + return r14 + +L(unorm): + rsb tnc, cnt, #32 + mov r11, r11, lsr cnt + subs n, n, #1 C set carry as side-effect + beq L(edu) + + ALIGN(16) +L(tpu): ldr r12, [up], #4 + orr r9, r11, r12, lsl tnc + mov r11, r12, lsr cnt + sbcs cy, r9, cy C critical path ->cy->cy-> + sub n, n, #1 + mul r9, r10, cy C critical path ->cy->r9-> + tst n, n + umull r12, cy, d, r9 C critical path ->r9->cy-> + str r9, [rp], #4 + bne L(tpu) + +L(edu): sbc cy, r11, cy + mul r9, r10, cy + str r9, [rp] + pop {r6-r11} + return r14 +EPILOGUE() + + RODATA +ctz_tab: + .byte 8,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 diff --git a/gcc/gmp/mpn/arm64/aorsorrlsh1_n.asm b/gcc/gmp/mpn/arm64/aorsorrlsh1_n.asm new file mode 100644 index 0000000..c617a67 100644 --- /dev/null +++ b/gcc/gmp/mpn/arm64/aorsorrlsh1_n.asm @@ -1,0 +1,43 @@ +dnl ARM64 mpn_addlsh1_n, 
mpn_sublsh1_n, mpn_rsblsh1_n. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +define(LSH, 1) +define(RSH, 63) + +ifdef(`OPERATION_addlsh1_n',`define(`DO_add')') +ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')') +ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n) + +include_mpn(`arm64/aorsorrlshC_n.asm') diff --git a/gcc/gmp/mpn/arm64/aorsorrlsh2_n.asm b/gcc/gmp/mpn/arm64/aorsorrlsh2_n.asm new file mode 100644 index 0000000..852d117 100644 --- /dev/null +++ b/gcc/gmp/mpn/arm64/aorsorrlsh2_n.asm @@ -1,0 +1,43 @@ +dnl ARM64 mpn_addlsh2_n, mpn_sublsh2_n, mpn_rsblsh2_n. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +define(LSH, 2) +define(RSH, 62) + +ifdef(`OPERATION_addlsh2_n',`define(`DO_add')') +ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')') +ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')') + +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n) + +include_mpn(`arm64/aorsorrlshC_n.asm') diff --git a/gcc/gmp/mpn/arm64/aorsorrlshC_n.asm b/gcc/gmp/mpn/arm64/aorsorrlshC_n.asm new file mode 100644 index 0000000..168caad 100644 --- /dev/null +++ b/gcc/gmp/mpn/arm64/aorsorrlshC_n.asm @@ -1,0 +1,139 @@ +dnl ARM64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Cortex-A53 3.25-3.75
+C Cortex-A57 2.18
+C X-Gene 2.5
+
+changecom(blah)
+
+define(`rp', `x0')
+define(`up', `x1')
+define(`vp', `x2')
+define(`n', `x3')
+
+ifdef(`DO_add', `
+  define(`ADDSUB',	`adds	$1, $2, $3')
+  define(`ADDSUBC',	`adcs	$1, $2, $3')
+  define(`CLRRCY',	`adds	$1, xzr, xzr')
+  define(`RETVAL',	`adc	x0, $1, xzr')
+  define(`func_n',	mpn_addlsh`'LSH`'_n)')
+ifdef(`DO_sub', `
+  define(`ADDSUB',	`subs	$1, $3, $2')
+  define(`ADDSUBC',	`sbcs	$1, $3, $2')
+  define(`CLRRCY',	`subs	$1, xzr, xzr')
+  define(`RETVAL',	`cinc	x0, $1, cc')
+  define(`func_n',	mpn_sublsh`'LSH`'_n)')
+ifdef(`DO_rsb', `
+  define(`ADDSUB',	`subs	$1, $2, $3')
+  define(`ADDSUBC',	`sbcs	$1, $2, $3')
+  define(`CLRRCY',	`subs	$1, xzr, xzr')
+  define(`RETVAL',	`sbc	x0, $1, xzr')
+  define(`func_n',	mpn_rsblsh`'LSH`'_n)')
+
+ASM_START()
+PROLOGUE(func_n)
+	lsr	x6, n, #2		C x6 = n >> 2, loop counter; kept out of x18, the AAPCS64 platform register
+	tbz	n, #0, L(bx0)
+
+L(bx1):	ldr	x5, [up]
+	tbnz	n, #1, L(b11)
+
+L(b01):	ldr	x11, [vp]
+	cbz	x6, L(1)
+	ldp	x8, x9, [vp,#8]
+	lsl	x13, x11, #LSH
+	ADDSUB(	x15, x13, x5)
+	str	x15, [rp],#8
+	sub	up, up, #24
+	sub	vp, vp, #8
+	b	L(mid)
+
+L(1):	lsl	x13, x11, #LSH
+	ADDSUB(	x15, x13, x5)
+	str	x15, [rp]
+	lsr	x0, x11, RSH
+	RETVAL(	x0, x1)
+	ret
+
+L(b11):	ldr	x9, [vp]
+	ldp	x10, x11, [vp,#8]!
+	lsl	x13, x9, #LSH
+	ADDSUB(	x17, x13, x5)
+	str	x17, [rp],#8
+	sub	up, up, #8
+	cbz	x6, L(end)
+	b	L(top)
+
+L(bx0):	tbnz	n, #1, L(b10)
+
+L(b00):	CLRRCY(	x11)
+	ldp	x8, x9, [vp],#-16
+	sub	up, up, #32
+	b	L(mid)
+
+L(b10):	CLRRCY(	x9)
+	ldp	x10, x11, [vp]
+	sub	up, up, #16
+	cbz	x6, L(end)
+
+	ALIGN(16)
+L(top):	ldp	x4, x5, [up,#16]
+	extr	x12, x10, x9, #RSH
+	ldp	x8, x9, [vp,#16]
+	extr	x13, x11, x10, #RSH
+	ADDSUBC(x14, x12, x4)
+	ADDSUBC(x15, x13, x5)
+	stp	x14, x15, [rp],#16
+L(mid):	ldp	x4, x5, [up,#32]!
+	extr	x12, x8, x11, #RSH
+	ldp	x10, x11, [vp,#32]!
+	extr	x13, x9, x8, #RSH
+	ADDSUBC(x16, x12, x4)
+	ADDSUBC(x17, x13, x5)
+	stp	x16, x17, [rp],#16
+	sub	x6, x6, #1
+	cbnz	x6, L(top)
+
+L(end):	ldp	x4, x5, [up,#16]
+	extr	x12, x10, x9, #RSH
+	extr	x13, x11, x10, #RSH
+	ADDSUBC(x14, x12, x4)
+	ADDSUBC(x15, x13, x5)
+	stp	x14, x15, [rp]
+	lsr	x0, x11, RSH
+	RETVAL(	x0, x1)
+	ret
+EPILOGUE()
diff --git a/gcc/gmp/mpn/arm64/bdiv_q_1.asm b/gcc/gmp/mpn/arm64/bdiv_q_1.asm
new file mode 100644
index 0000000..2e189b8 100644
--- /dev/null
+++ b/gcc/gmp/mpn/arm64/bdiv_q_1.asm
@@ -1,0 +1,128 @@
+dnl ARM64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2012, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C               cycles/limb
+C               norm   unorm
+C Cortex-A53	12	15
+C Cortex-A57	12	12
+C Cortex-A72
+C Cortex-A73
+C X-Gene	11	11
+
+C TODO
+C  * Scheduling of umulh later in the unorm loop brings A53 time to 12 c/l.
+C    Unfortunately, that requires software pipelining.
+
+define(`rp',		`x0')
+define(`up',		`x1')
+define(`n',		`x2')
+define(`d',		`x3')
+define(`di',		`x4')		C	just mpn_pi1_bdiv_q_1
+define(`cnt',		`x5')		C	just mpn_pi1_bdiv_q_1
+
+define(`cy',		`r7')		C	not referenced by the code below
+define(`tnc',		`x8')
+
+ASM_START()
+PROLOGUE(mpn_bdiv_q_1)
+C Strip the trailing zero bits of d, build di = inverse of the odd d, then continue in mpn_pi1_bdiv_q_1.
+	rbit	x6, d			C rbit + clz = count of trailing zero bits
+	clz	cnt, x6
+	lsr	d, d, cnt		C d is odd from here on
+
+ifdef(`PIC',`
+	adrp	x7, :got:__gmp_binvert_limb_table
+	ubfx	x6, d, 1, 7
+	ldr	x7, [x7, #:got_lo12:__gmp_binvert_limb_table]
+',`
+	adrp	x7, __gmp_binvert_limb_table
+	ubfx	x6, d, 1, 7
+	add	x7, x7, :lo12:__gmp_binvert_limb_table
+')
+	ldrb	w6, [x7, x6]		C one-byte table entry selected by bits 1..7 of d
+	ubfiz	x7, x6, 1, 8
+	umull	x6, w6, w6
+	msub	x6, x6, d, x7		C Newton step: inv = 2*inv - inv*inv*d
+	lsl	x7, x6, 1
+	mul	x6, x6, x6
+	msub	x6, x6, d, x7		C second Newton step
+	lsl	x7, x6, 1
+	mul	x6, x6, x6
+	msub	di, x6, d, x7		C third Newton step gives di
+
+	b	mpn_pi1_bdiv_q_1	C tail call: rp, up, n, d, di and cnt are in place
+EPILOGUE()
+C mpn_pi1_bdiv_q_1: same division with di and cnt supplied by the caller; d is used as already shifted right by cnt.
+PROLOGUE(mpn_pi1_bdiv_q_1)
+	sub	n, n, #1
+	subs	x6, x6, x6		C x6 = 0 and C flag = 1, i.e. no pending borrow
+	ldr	x9, [up],#8
+	cbz	cnt, L(norm)		C cnt = 0: no shifting of the dividend needed
+
+L(unorm):
+	lsr	x12, x9, cnt
+	cbz	n, L(eu1)
+	sub	tnc, xzr, cnt		C tnc = -cnt, shifts like 64-cnt
+
+L(tpu):	ldr	x9, [up],#8
+	lsl	x7, x9, tnc
+	orr	x7, x7, x12		C next limb of the dividend shifted right by cnt
+	sbcs	x6, x7, x6		C minus the high product limb of the previous step, with borrow
+	mul	x7, x6, di		C quotient limb
+	str	x7, [rp],#8
+	lsr	x12, x9, cnt
+	umulh	x6, x7, d		C high half of q*d, subtracted from the next limb
+	sub	n, n, #1
+	cbnz	n, L(tpu)
+
+L(eu1):	sbcs	x6, x12, x6
+	mul	x6, x6, di		C last quotient limb
+	str	x6, [rp]
+	ret
+
+L(norm):
+	mul	x5, x9, di		C first quotient limb
+	str	x5, [rp],#8
+	cbz	n, L(en1)
+
+L(tpn):	ldr	x9, [up],#8
+	umulh	x5, x5, d		C high half of the previous q*d
+	sbcs	x5, x9, x5
+	mul	x5, x5, di		C quotient limb
+	str	x5, [rp],#8
+	sub	n, n, #1
+	cbnz	n, L(tpn)
+
+L(en1):	ret
+EPILOGUE()
diff --git a/gcc/gmp/mpn/arm64/gcd_11.asm b/gcc/gmp/mpn/arm64/gcd_11.asm
new file mode 100644
index 0000000..d8cc3e2 100644
--- /dev/null
+++ b/gcc/gmp/mpn/arm64/gcd_11.asm
@@ -1,0 +1,70 @@
+dnl  ARM v8a mpn_gcd_11.
+
+dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for ARM by Torbjorn
+dnl  Granlund.
+
+dnl  Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+changecom(blah)
+
+C	     cycles/bit (approx)
+C Cortex-A35	 ?
+C Cortex-A53	 ?
+C Cortex-A55	 ?
+C Cortex-A57	 ?
+C Cortex-A72	 ?
+C Cortex-A73	 ?
+C Cortex-A75	 ?
+C Cortex-A76	 ?
+C Cortex-A77	 ?
+C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
+
+define(`u0',    `x0')
+define(`v0',    `x1')
+C mpn_gcd_11 returns gcd(u0, v0) in x0; the loop relies on every difference u - v being even, i.e. on both inputs being odd.
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_gcd_11)
+	subs	x3, u0, v0		C x3 = u - v, C flag = (u >= v)	0
+	b.eq	L(end)			C equal inputs: the gcd is already in x0
+
+	ALIGN(16)
+L(top):	rbit	x12, x3			C rbit + clz = ctz(u - v)	1,5
+	clz	x12, x12		C			2
+	csneg	x3, x3, x3, cs		C v = abs(u-v), even	1
+	csel	u0, v0, u0, cs		C u = min(u,v)		1
+	lsr	v0, x3, x12		C v = abs(u-v) >> ctz	3
+	subs	x3, u0, v0		C next difference	4
+	b.ne	L(top)			C stop when u == v
+
+L(end):	ret
+EPILOGUE()
diff --git a/gcc/gmp/mpn/arm64/gcd_22.asm b/gcc/gmp/mpn/arm64/gcd_22.asm
new file mode 100644
index 0000000..5367fea 100644
--- /dev/null
+++ b/gcc/gmp/mpn/arm64/gcd_22.asm
@@ -1,0 +1,112 @@
+dnl  ARM v8a mpn_gcd_22.
+
+dnl  Copyright 2019 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+changecom(blah)
+
+C	     cycles/bit (approx)
+C Cortex-A35	 ?
+C Cortex-A53	 7.26
+C Cortex-A55	 ?
+C Cortex-A57	 ?
+C Cortex-A72	 5.72
+C Cortex-A73	 6.43
+C Cortex-A75	 ?
+C Cortex-A76	 ?
+C Cortex-A77	 ?
+
+C mpn_gcd_22: gcd of the two-limb values {u1,u0} and {v1,v0}; the result comes back with its low limb in x0 and its high limb in x1.
+define(`u1',    `x0')
+define(`u0',    `x1')
+define(`v1',    `x2')
+define(`v0',    `x3')
+
+define(`t0',    `x5')
+define(`t1',    `x6')
+define(`cnt',   `x7')
+define(`tnc',   `x8')
+
+ASM_START()
+PROLOGUE(mpn_gcd_22)
+
+	ALIGN(16)
+L(top):	subs	t0, u0, v0		C 0 6
+	cbz	t0, L(lowz)		C low limbs equal: handled apart, the cneg/cinv negation below needs t0 != 0
+	sbcs	t1, u1, v1		C 1 7
+
+	rbit	cnt, t0			C 1
+
+	cneg	t0, t0, cc		C 2
+	cinv	t1, t1, cc		C 2 u = |u - v|
+L(bck):	csel	v0, v0, u0, cs		C 2
+	csel	v1, v1, u1, cs		C 2 v = min(u,v)
+
+	clz	cnt, cnt		C 2
+	sub	tnc, xzr, cnt		C 3
+
+	lsr	u0, t0, cnt		C 3
+	lsl	x14, t1, tnc		C 4
+	lsr	u1, t1, cnt		C 3
+	orr	u0, u0, x14		C 5
+
+	orr	x11, u1, v1		C loop while either high limb is nonzero
+	cbnz	x11, L(top)
+
+C Both high limbs are zero: finish with a single-limb binary gcd loop on u0 and v0.
+	subs	x4, u0, v0		C			0
+	b.eq	L(end1)			C
+
+	ALIGN(16)
+L(top1):rbit	x12, x4			C			1,5
+	clz	x12, x12		C			2
+	csneg	x4, x4, x4, cs		C v = abs(u-v), even	1
+	csel	u0, v0, u0, cs		C u = min(u,v)		1
+	lsr	v0, x4, x12		C			3
+	subs	x4, u0, v0		C			4
+	b.ne	L(top1)			C
+L(end1):mov	x0, u0			C low limb of the result
+	mov	x1, #0			C high limb of the result is zero
+	ret
+
+L(lowz):C We come here when v0 - u0 = 0
+	C 1. If v1 - u1 = 0, then gcd is u = v.
+	C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
+	subs	t0, u1, v1
+	b.eq	L(end)
+	mov	t1, #0			C the difference {u1-v1, 0} is used with its zero low limb dropped
+	rbit	cnt, t0			C 1
+	cneg	t0, t0, cc		C 2
+	b	L(bck)			C FIXME: make conditional
+
+L(end):	mov	x0, v0			C u == v: return v, low limb
+	mov	x1, v1			C and high limb
+	ret
+EPILOGUE()
diff --git a/gcc/gmp/mpn/arm64/lshiftc.asm b/gcc/gmp/mpn/arm64/lshiftc.asm
new file mode 100644
index 0000000..dd4c4ce 100644
--- /dev/null
+++ b/gcc/gmp/mpn/arm64/lshiftc.asm
@@ -1,0 +1,130 @@
+dnl  ARM64 mpn_lshiftc.
+
+dnl  Copyright 2013, 2014, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb   assumed optimal c/l
+C Cortex-A53	3.5-4.0		 3.25
+C Cortex-A57	 2.0		 2.0
+C X-Gene	 2.67		 2.5
+
+C TODO
+C  * The feed-in code used 1 ldr for odd sized and 2 ldr for even sizes.  These
+C    numbers should be 1 and 0, respectively.  The str in wind-down should also
+C    go.
+C  * Using extr and with 63 separate loops we might reach 1.5 c/l on A57.
+C  * A53's speed depends on alignment, tune/speed -w1 gives 3.5, -w0 gives 4.0.
+
+changecom(blah)
+
+define(`rp_arg', `x0')
+define(`up',     `x1')
+define(`n',      `x2')
+define(`cnt',    `x3')
+
+define(`rp',     `x16')
+
+define(`tnc',`x8')
+
+define(`PSHIFT', lsl)
+define(`NSHIFT', lsr)
+C mpn_lshiftc: {rp_arg,n} = bitwise complement of ({up,n} << cnt); x0 returns the bits shifted out of the top limb, not complemented.
+ASM_START()
+PROLOGUE(mpn_lshiftc)
+	add	rp, rp_arg, n, lsl #3	C point past the end: limbs are handled from the top down
+	add	up, up, n, lsl #3
+	sub	tnc, xzr, cnt		C tnc = -cnt, which shifts like 64-cnt (register shift counts are taken mod 64)
+	lsr	x18, n, #2		C x18 = n / 4, the loop counter
+	tbz	n, #0, L(bx0)		C dispatch on n mod 4: bit 0 here, bit 1 below
+
+L(bx1):	ldr	x4, [up,#-8]
+	tbnz	n, #1, L(b11)
+
+L(b01):	NSHIFT	x0, x4, tnc		C return value: bits shifted out of the top limb
+	PSHIFT	x2, x4, cnt
+	cbnz	x18, L(gt1)
+	mvn	x2, x2			C n = 1: complement; the vacated low bits become ones
+	str	x2, [rp,#-8]
+	ret
+L(gt1):	ldp	x4, x5, [up,#-24]
+	sub	up, up, #8
+	add	rp, rp, #16
+	b	L(lo2)
+
+L(b11):	NSHIFT	x0, x4, tnc		C return value
+	PSHIFT	x2, x4, cnt
+	ldp	x6, x7, [up,#-24]!
+	b	L(lo3)
+
+L(bx0):	ldp	x4, x5, [up,#-16]
+	tbz	n, #1, L(b00)
+
+L(b10):	NSHIFT	x0, x5, tnc		C return value
+	PSHIFT	x13, x5, cnt
+	NSHIFT	x10, x4, tnc
+	PSHIFT	x2, x4, cnt
+	cbnz	x18, L(gt2)
+	eon	x10, x10, x13		C eon gives ~(x10 | x13) here: the two shifted parts share no bits
+	mvn	x2, x2
+	stp	x2, x10, [rp,#-16]
+	ret
+L(gt2):	ldp	x4, x5, [up,#-32]
+	eon	x10, x10, x13
+	str	x10, [rp,#-8]
+	sub	up, up, #16
+	add	rp, rp, #8
+	b	L(lo2)
+
+L(b00):	NSHIFT	x0, x5, tnc		C return value
+	PSHIFT	x13, x5, cnt
+	NSHIFT	x10, x4, tnc
+	PSHIFT	x2, x4, cnt
+	ldp	x6, x7, [up,#-32]!
+	eon	x10, x10, x13
+	str	x10, [rp,#-8]!
+	b	L(lo0)
+
+	ALIGN(16)
+L(top):	ldp	x4, x5, [up,#-16]
+	eon	x10, x10, x13
+	eon	x11, x12, x2
+	stp	x10, x11, [rp,#-16]
+	PSHIFT	x2, x6, cnt
+L(lo2):	NSHIFT	x10, x4, tnc
+	PSHIFT	x13, x5, cnt
+	NSHIFT	x12, x5, tnc
+	ldp	x6, x7, [up,#-32]!
+	eon	x10, x10, x13
+	eon	x11, x12, x2
+	stp	x10, x11, [rp,#-32]!
+	PSHIFT	x2, x4, cnt
+L(lo0):	sub	x18, x18, #1
+L(lo3):	NSHIFT	x10, x6, tnc
+	PSHIFT	x13, x7, cnt
+	NSHIFT	x12, x7, tnc
+	cbnz	x18, L(top)
+
+L(end):	eon	x10, x10, x13
+	eon	x11, x12, x2
+	PSHIFT	x2, x6, cnt
+	stp	x10, x11, [rp,#-16]
+	mvn	x2, x2			C lowest limb: complement; the vacated low bits become ones
+	str	x2, [rp,#-24]
+	ret
+EPILOGUE()
diff --git a/gcc/gmp/mpn/arm64/rsh1aors_n.asm b/gcc/gmp/mpn/arm64/rsh1aors_n.asm
new file mode 100644
index 0000000..e0b760b 100644
--- /dev/null
+++ b/gcc/gmp/mpn/arm64/rsh1aors_n.asm
@@ -1,0 +1,168 @@
+dnl  ARM64 mpn_rsh1add_n and mpn_rsh1sub_n.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb   assumed optimal c/l
+C Cortex-A53	3.25-3.75	 3.0 steady
+C Cortex-A57	 2.15		 1.75
+C X-Gene	 2.75		 2.5
+
+changecom(blah)
+
+define(`rp', `x0')
+define(`up', `x1')
+define(`vp', `x2')
+define(`n',  `x3')
+
+ifdef(`OPERATION_rsh1add_n', `
+  define(`ADDSUB',	adds)
+  define(`ADDSUBC',	adcs)
+  define(`COND',	`cs')
+  define(`func_n',	mpn_rsh1add_n)')
+ifdef(`OPERATION_rsh1sub_n', `
+  define(`ADDSUB',	subs)
+  define(`ADDSUBC',	sbcs)
+  define(`COND',	`cc')
+  define(`func_n',	mpn_rsh1sub_n)')
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
+C func_n: {rp,n} = ({up,n} +/- {vp,n}) >> 1, with the carry/borrow out as the top result bit; x0 returns the bit shifted out.
+ASM_START()
+PROLOGUE(func_n)
+	lsr	x18, n, #2		C x18 = n / 4, the loop counter
+
+	tbz	n, #0, L(bx0)		C dispatch on n mod 4: bit 0 here, bit 1 below
+
+L(bx1):	ldr	x5, [up],#8
+	ldr	x9, [vp],#8
+	tbnz	n, #1, L(b11)
+
+L(b01):	ADDSUB	x13, x5, x9
+	and	x10, x13, #1		C x10 = low bit of the sum/difference, the return value
+	cbz	x18, L(1)
+	ldp	x4, x5, [up],#48
+	ldp	x8, x9, [vp],#48
+	ADDSUBC	x14, x4, x8
+	ADDSUBC	x15, x5, x9
+	ldp	x4, x5, [up,#-32]
+	ldp	x8, x9, [vp,#-32]
+	extr	x17, x14, x13, #1
+	ADDSUBC	x12, x4, x8
+	ADDSUBC	x13, x5, x9
+	str	x17, [rp], #24
+	sub	x18, x18, #1
+	cbz	x18, L(end)
+	b	L(top)
+
+L(1):	cset	x14, COND		C n = 1: x14 = carry/borrow out
+	extr	x17, x14, x13, #1	C (x14:x13) >> 1
+	str	x17, [rp]
+	mov	x0, x10
+	ret
+
+L(b11):	ADDSUB	x15, x5, x9
+	and	x10, x15, #1		C x10 = low bit of the sum/difference, the return value
+
+	ldp	x4, x5, [up],#32
+	ldp	x8, x9, [vp],#32
+	ADDSUBC	x12, x4, x8
+	ADDSUBC	x13, x5, x9
+	cbz	x18, L(3)
+	ldp	x4, x5, [up,#-16]
+	ldp	x8, x9, [vp,#-16]
+	extr	x17, x12, x15, #1
+	ADDSUBC	x14, x4, x8
+	ADDSUBC	x15, x5, x9
+	str	x17, [rp], #8
+	b	L(mid)
+
+L(3):	extr	x17, x12, x15, #1
+	str	x17, [rp], #8
+	b	L(2)
+
+L(bx0):	tbz	n, #1, L(b00)
+
+L(b10):	ldp	x4, x5, [up],#32
+	ldp	x8, x9, [vp],#32
+	ADDSUB	x12, x4, x8
+	ADDSUBC	x13, x5, x9
+	and	x10, x12, #1		C x10 = low bit of the sum/difference, the return value
+	cbz	x18, L(2)
+	ldp	x4, x5, [up,#-16]
+	ldp	x8, x9, [vp,#-16]
+	ADDSUBC	x14, x4, x8
+	ADDSUBC	x15, x5, x9
+	b	L(mid)
+
+L(b00):	ldp	x4, x5, [up],#48
+	ldp	x8, x9, [vp],#48
+	ADDSUB	x14, x4, x8
+	ADDSUBC	x15, x5, x9
+	and	x10, x14, #1		C x10 = low bit of the sum/difference, the return value
+	ldp	x4, x5, [up,#-32]
+	ldp	x8, x9, [vp,#-32]
+	ADDSUBC	x12, x4, x8
+	ADDSUBC	x13, x5, x9
+	add	rp, rp, #16
+	sub	x18, x18, #1
+	cbz	x18, L(end)
+
+	ALIGN(16)
+L(top):	ldp	x4, x5, [up,#-16]
+	ldp	x8, x9, [vp,#-16]
+	extr	x16, x15, x14, #1	C each extr forms one result limb from two adjacent sum limbs
+	extr	x17, x12, x15, #1
+	ADDSUBC	x14, x4, x8
+	ADDSUBC	x15, x5, x9
+	stp	x16, x17, [rp,#-16]
+L(mid):	ldp	x4, x5, [up],#32
+	ldp	x8, x9, [vp],#32
+	extr	x16, x13, x12, #1
+	extr	x17, x14, x13, #1
+	ADDSUBC	x12, x4, x8
+	ADDSUBC	x13, x5, x9
+	stp	x16, x17, [rp],#32
+	sub	x18, x18, #1
+	cbnz	x18, L(top)
+
+L(end):	extr	x16, x15, x14, #1
+	extr	x17, x12, x15, #1
+	stp	x16, x17, [rp,#-16]
+L(2):	cset	x14, COND		C the final carry/borrow becomes the top result bit
+	extr	x16, x13, x12, #1
+	extr	x17, x14, x13, #1
+	stp	x16, x17, [rp]
+
+L(ret):	mov	x0, x10			C return the bit shifted out
+	ret
+EPILOGUE()
diff --git a/gcc/gmp/mpn/arm64/sqr_diag_addlsh1.asm b/gcc/gmp/mpn/arm64/sqr_diag_addlsh1.asm
new file mode 100644
index 0000000..55b5ac7 100644
--- /dev/null
+++ b/gcc/gmp/mpn/arm64/sqr_diag_addlsh1.asm
@@ -1,0 +1,102 @@
+dnl  ARM64 mpn_sqr_diag_addlsh1.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2016, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C Cortex-A53	 5.65
+C Cortex-A57	 3.5
+C X-Gene	 3.38
+
+changecom(blah)
+
+define(`rp', `x0')
+define(`tp', `x1')
+define(`up', `x2')
+define(`n',  `x3')
+C mpn_sqr_diag_addlsh1: rp gets the two-limb squares of the up limbs plus tp doubled, tp entering one limb up.  NOTE(review): tp length 2n-2 inferred from the n = 2 path.
+ASM_START()
+PROLOGUE(mpn_sqr_diag_addlsh1)
+	ldr	x15, [up],#8
+	lsr	x18, n, #1		C x18 = n / 2, the loop counter
+	tbz	n, #0, L(bx0)
+
+L(bx1):	adds	x7, xzr, xzr		C x7 = 0 stands for the limb below tp[0]; carry cleared
+	mul	x12, x15, x15
+	ldr	x16, [up],#8
+	ldp	x4, x5, [tp],#16
+	umulh	x11, x15, x15
+	b	L(mid)
+
+L(bx0):	adds	x5, xzr, xzr		C x5 = 0 stands for the limb below tp[0]; carry cleared
+	mul	x12, x15, x15
+	ldr	x17, [up],#16
+	ldp	x6, x7, [tp],#32
+	umulh	x11, x15, x15
+	sub	x18, x18, #1
+	cbz	x18, L(end)
+
+	ALIGN(16)
+L(top):	extr	x9, x6, x5, #63		C (x6:x5) >> 63 = one limb of tp doubled
+	mul	x10, x17, x17
+	ldr	x16, [up,#-8]
+	adcs	x13, x9, x11		C add it to the high half of the previous square
+	ldp	x4, x5, [tp,#-16]
+	umulh	x11, x17, x17
+	extr	x8, x7, x6, #63
+	stp	x12, x13, [rp],#16
+	adcs	x12, x8, x10		C and the next one to the low half of this square
+L(mid):	extr	x9, x4, x7, #63
+	mul	x10, x16, x16
+	ldr	x17, [up],#16
+	adcs	x13, x9, x11
+	ldp	x6, x7, [tp],#32
+	umulh	x11, x16, x16
+	extr	x8, x5, x4, #63
+	stp	x12, x13, [rp],#16
+	adcs	x12, x8, x10
+	sub	x18, x18, #1
+	cbnz	x18, L(top)
+
+L(end):	extr	x9, x6, x5, #63
+	mul	x10, x17, x17
+	adcs	x13, x9, x11
+	umulh	x11, x17, x17
+	extr	x8, x7, x6, #63
+	stp	x12, x13, [rp]
+	adcs	x12, x8, x10
+	extr	x9, xzr, x7, #63	C top bit shifted out of the last tp limb
+	adcs	x13, x9, x11
+	stp	x12, x13, [rp,#16]
+
+	ret
+EPILOGUE()
diff --git a/gcc/gmp/mpn/generic/compute_powtab.c b/gcc/gmp/mpn/generic/compute_powtab.c
new file mode 100644
index 0000000..f4fbc64 100644
--- /dev/null
+++ b/gcc/gmp/mpn/generic/compute_powtab.c
@@ -1,0 +1,373 @@
+/* mpn_compute_powtab.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 1991-2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/*
+  CAVEATS:
+  * The exptab and powtab vectors are in opposite orders.  Probably OK.
+  * Consider getting rid of exptab, doing bit ops on the un argument instead.
+  * Consider rounding greatest power slightly upwards to save adjustments.
+  * In powtab_decide, consider computing cost from just the 2-3 largest
+    operands, since smaller operand contribute little.  This makes most sense
+    if exptab is suppressed.
+*/
+
+#include "gmp-impl.h"
+
+#ifndef DIV_1_VS_MUL_1_PERCENT
+#define DIV_1_VS_MUL_1_PERCENT 150
+#endif
+
+#define SET_powers_t(dest, ptr, size, dib, b, sh)	\
+  do {							\
+    dest.p = ptr;					\
+    dest.n = size;					\
+    dest.digits_in_base = dib;				\
+    dest.base = b;					\
+    dest.shift = sh;					\
+  } while (0)
+/* Fill powtab[] by repeated squaring, multiplying by big_base wherever a power falls short of its exptab[] target.  */
+#if DIV_1_VS_MUL_1_PERCENT > 120
+#define HAVE_mpn_compute_powtab_mul 1
+static void
+mpn_compute_powtab_mul (powers_t *powtab, mp_ptr powtab_mem, mp_size_t un,
+                        int base, const size_t *exptab, size_t n_pows)
+{
+  mp_size_t n;
+  mp_ptr p, t;
+  mp_limb_t cy;
+  long start_idx;
+  int c;
+
+  mp_limb_t big_base = mp_bases[base].big_base;
+  int chars_per_limb = mp_bases[base].chars_per_limb;
+
+  mp_ptr powtab_mem_ptr = powtab_mem;
+
+  size_t digits_in_base = chars_per_limb;
+
+  powers_t *pt = powtab;
+
+  p = powtab_mem_ptr;
+  powtab_mem_ptr += 1;
+  p[0] = big_base;
+
+  SET_powers_t (pt[0], p, 1, digits_in_base, base, 0);
+  pt++;
+
+  t = powtab_mem_ptr;
+  powtab_mem_ptr += 2;
+  t[1] = mpn_mul_1 (t, p, 1, big_base);
+  n = 2;
+
+  digits_in_base *= 2;
+
+  c = t[0] == 0;                /* drop a zero low limb, recording it in shift */
+  t += c;
+  n -= c;
+  mp_size_t shift = c;
+
+  SET_powers_t (pt[0], t, n, digits_in_base, base, shift);
+  p = t;
+  pt++;
+
+  if (exptab[0] == ((size_t) chars_per_limb << n_pows))
+    {
+      start_idx = n_pows - 2;
+    }
+  else
+    {
+      if (((digits_in_base + chars_per_limb) << (n_pows-2)) <= exptab[0])
+        {
+          /* 3, sometimes adjusted to 4.  */
+          t = powtab_mem_ptr;
+          powtab_mem_ptr += 4;
+          t[n] = cy = mpn_mul_1 (t, p, n, big_base);
+          n += cy != 0;
+
+          digits_in_base += chars_per_limb;
+
+          c = t[0] == 0;
+          t += c;
+          n -= c;
+          shift += c;
+        }
+      else
+        {
+          /* 2 copy, will always become 3 with back-multiplication.  */
+          t = powtab_mem_ptr;
+          powtab_mem_ptr += 3;
+          t[0] = p[0];
+          t[1] = p[1];
+        }
+
+      SET_powers_t (pt[0], t, n, digits_in_base, base, shift);
+      p = t;
+      pt++;
+      start_idx = n_pows - 3;
+    }
+
+  for (long pi = start_idx; pi >= 0; pi--)
+    {
+      t = powtab_mem_ptr;
+      powtab_mem_ptr += 2 * n + 2;
+
+      ASSERT (powtab_mem_ptr < powtab_mem + mpn_str_powtab_alloc (un));
+
+      mpn_sqr (t, p, n);
+
+      digits_in_base *= 2;
+      n *= 2;
+      n -= t[n - 1] == 0;       /* the square may be one limb shorter */
+      shift *= 2;
+
+      c = t[0] == 0;
+      t += c;
+      n -= c;
+      shift += c;
+
+      /* Adjust new value if it is too small as input to the next squaring.  */
+      if (((digits_in_base + chars_per_limb) << pi) <= exptab[0])
+        {
+          t[n] = cy = mpn_mul_1 (t, t, n, big_base);
+          n += cy != 0;
+
+          digits_in_base += chars_per_limb;
+
+          c = t[0] == 0;
+          t += c;
+          n -= c;
+          shift += c;
+        }
+
+      SET_powers_t (pt[0], t, n, digits_in_base, base, shift);
+
+      /* Adjust previous value if it is not at its target power.  */
+      if (pt[-1].digits_in_base < exptab[pi + 1])
+        {
+          mp_size_t prev_n = pt[-1].n;  /* own names: the outer n and p stay live */
+          mp_ptr prev_p = pt[-1].p;
+          prev_p[prev_n] = cy = mpn_mul_1 (prev_p, prev_p, prev_n, big_base);
+          prev_n += cy != 0;
+
+          ASSERT (pt[-1].digits_in_base + chars_per_limb == exptab[pi + 1]);
+          pt[-1].digits_in_base = exptab[pi + 1];
+
+          c = prev_p[0] == 0;
+          pt[-1].p = prev_p + c;
+          pt[-1].n = prev_n - c;
+          pt[-1].shift += c;
+        }
+
+      p = t;
+      pt++;
+    }
+}
+#endif
+/* Fill powtab[] by repeated squaring, dividing by big_base wherever a square overshoots its exptab[] target.  */
+#if DIV_1_VS_MUL_1_PERCENT < 275
+#define HAVE_mpn_compute_powtab_div 1
+static void
+mpn_compute_powtab_div (powers_t *powtab, mp_ptr powtab_mem, mp_size_t un,
+                        int base, const size_t *exptab, size_t n_pows)
+{
+  mp_ptr p, t;
+
+  mp_limb_t big_base = mp_bases[base].big_base;
+  int chars_per_limb = mp_bases[base].chars_per_limb;
+
+  mp_ptr powtab_mem_ptr = powtab_mem;
+
+  size_t digits_in_base = chars_per_limb;
+
+  powers_t *pt = powtab;
+
+  p = powtab_mem_ptr;
+  powtab_mem_ptr += 1;
+  p[0] = big_base;
+
+  SET_powers_t (pt[0], p, 1, digits_in_base, base, 0);
+  pt++;
+
+  mp_size_t n = 1;
+  mp_size_t shift = 0;
+  for (long pi = n_pows - 1; pi >= 0; pi--)
+    {
+      t = powtab_mem_ptr;
+      powtab_mem_ptr += 2 * n;
+
+      ASSERT (powtab_mem_ptr < powtab_mem + mpn_str_powtab_alloc (un));
+
+      mpn_sqr (t, p, n);
+      n = 2 * n - 1; n += t[n] != 0;
+      digits_in_base *= 2;
+
+      if (digits_in_base != exptab[pi])	/* if ((((un - 1) >> pi) & 2) == 0) */
+        {
+#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 || ! HAVE_NATIVE_mpn_divexact_1
+          if (__GMP_LIKELY (base == 10))
+            mpn_pi1_bdiv_q_1 (t, t, n, big_base >> MP_BASES_BIG_BASE_CTZ_10,
+                              MP_BASES_BIG_BASE_BINVERTED_10,
+                              MP_BASES_BIG_BASE_CTZ_10);
+          else
+#endif
+            /* FIXME: We could use _pi1 here if we add big_base_binverted and
+               big_base_ctz fields to struct bases.  That would add about 2 KiB
+               to mp_bases.c.
+               FIXME: Use mpn_bdiv_q_1 here when mpn_divexact_1 is converted to
+               mpn_bdiv_q_1 for more machines.  */
+            mpn_divexact_1 (t, t, n, big_base);
+
+          n -= t[n - 1] == 0;
+          digits_in_base -= chars_per_limb;
+        }
+
+      shift *= 2;
+      /* Strip low zero limbs, but be careful to keep the result divisible by
+         big_base.  */
+      while (t[0] == 0 && (t[1] & ((big_base & -big_base) - 1)) == 0)
+        {
+          t++;
+          n--;
+          shift++;
+        }
+      p = t;
+
+      SET_powers_t (pt[0], p, n, digits_in_base, base, shift);
+      pt++;
+    }
+
+  /* Strip any remaining low zero limbs.  */
+  pt -= n_pows + 1;
+  for (long pi = n_pows; pi >= 0; pi--)
+    {
+      mp_ptr t = pt[pi].p;
+      mp_size_t shift = pt[pi].shift;
+      mp_size_t n = pt[pi].n;
+      int c;
+      c = t[0] == 0;
+      t += c;
+      n -= c;
+      shift += c;
+      pt[pi].p = t;
+      pt[pi].shift = shift;
+      pt[pi].n = n;
+    }
+}
+#endif
+/* Fill exptab[] with the target digit counts; return n_pows to select the mul variant, -n_pows to select div.  */
+static long
+powtab_decide (size_t *exptab, size_t un, int base)
+{
+  int chars_per_limb = mp_bases[base].chars_per_limb;
+  long n_pows = 0;
+  for (size_t pn = (un + 1) >> 1; pn != 1; pn = (pn + 1) >> 1)
+    {
+      exptab[n_pows] = pn * chars_per_limb;
+      n_pows++;
+    }
+  exptab[n_pows] = chars_per_limb;
+
+#if HAVE_mpn_compute_powtab_mul && HAVE_mpn_compute_powtab_div
+  size_t pn = un - 1;
+  size_t xn = (un + 1) >> 1;
+  unsigned mcost = 1;
+  unsigned dcost = 1;
+  for (long i = n_pows - 2; i >= 0; i--)
+    {
+      size_t pow = (pn >> (i + 1)) + 1;
+
+      if (pow & 1)
+        dcost += pow;
+
+      if (xn != (pow << i))
+        {
+          if (pow > 2 && (pow & 1) == 0)
+            mcost += 2 * pow;
+          else
+            mcost += pow;
+        }
+      else
+        {
+          if (pow & 1)
+            mcost += pow;
+        }
+    }
+
+  dcost = dcost * DIV_1_VS_MUL_1_PERCENT / 100;        /* weight the division steps against the multiplication steps */
+
+  if (mcost <= dcost)
+    return n_pows;
+  else
+    return -n_pows;
+#elif HAVE_mpn_compute_powtab_mul
+  return n_pows;
+#elif HAVE_mpn_compute_powtab_div
+  return -n_pows;
+#else
+#error "no powtab function available"
+#endif
+}
+/* Build the powers table for a un-limb operand in BASE; returns n_pows, the table holding n_pows + 1 entries.  */
+size_t
+mpn_compute_powtab (powers_t *powtab, mp_ptr powtab_mem, mp_size_t un, int base)
+{
+  size_t exptab[GMP_LIMB_BITS];
+
+  long n_pows = powtab_decide (exptab, un, base);
+
+#if HAVE_mpn_compute_powtab_mul && HAVE_mpn_compute_powtab_div
+  if (n_pows >= 0)
+    {
+      mpn_compute_powtab_mul (powtab, powtab_mem, un, base, exptab, n_pows);
+      return n_pows;
+    }
+  else
+    {
+      mpn_compute_powtab_div (powtab, powtab_mem, un, base, exptab, -n_pows);
+      return -n_pows;
+    }
+#elif HAVE_mpn_compute_powtab_mul
+  ASSERT (n_pows > 0);
+  mpn_compute_powtab_mul (powtab, powtab_mem, un, base, exptab, n_pows);
+  return n_pows;
+#elif HAVE_mpn_compute_powtab_div
+  ASSERT (n_pows < 0);
+  mpn_compute_powtab_div (powtab, powtab_mem, un, base, exptab, -n_pows);
+  return -n_pows;
+#else
+#error "no powtab function available"
+#endif
+}
diff --git a/gcc/gmp/mpn/generic/fib2m.c b/gcc/gmp/mpn/generic/fib2m.c
new file mode 100644
index 0000000..89d2b86 100644
--- /dev/null
+++ b/gcc/gmp/mpn/generic/fib2m.c
@@ -1,0 +1,252 @@
+/* mpn_fib2m -- calculate Fibonacci numbers, modulo m.
+
+Contributed to the GNU project by Marco Bodrato, based on the previous
+fib2_ui.c file.
+
+   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
+   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
+   FUTURE GNU MP RELEASES.
+
+Copyright 2001, 2002, 2005, 2009, 2018 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include <stdio.h>
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* Stores |{ap,n}-{bp,n}| in {rp,n},
+   returns the sign of {ap,n}-{bp,n}.
*/
+static int
+abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n)
+{
+  mp_limb_t  x, y;
+  while (--n >= 0)              /* scan from the most significant limb down */
+    {
+      x = ap[n];
+      y = bp[n];
+      if (x != y)
+        {
+          ++n;                  /* only the n low limbs are left to subtract */
+          if (x > y)
+            {
+              ASSERT_NOCARRY (mpn_sub_n (rp, ap, bp, n));
+              return 1;
+            }
+          else
+            {
+              ASSERT_NOCARRY (mpn_sub_n (rp, bp, ap, n));
+              return -1;
+            }
+        }
+      rp[n] = 0;                /* equal high limbs give a zero result limb */
+    }
+  return 0;
+}
+
+/* Store F[n] at fp and F[n-1] at f1p.  Both are computed modulo m.
+   fp and f1p should have room for mn*2+1 limbs.
+   {np,nn} is the index n (np[nn-1] != 0) and {mp,mn} is the modulus m.
+   The sign of one or both the values may be flipped (m-F, instead of F),
+   the return value is 0 (zero) if the signs are coherent (both positive
+   or both negative) and 1 (one) otherwise.
+
+   Notes:
+
+   In F[2k+1] with k even, +2 is applied to 4*F[k]^2 just by ORing into the
+   low limb.
+
+   In F[2k+1] with k odd, -2 is applied to F[k-1]^2 just by ORing into the
+   low limb.
+
+   TODO: Should {tp, 2 * mn} be passed as a scratch pointer?
+   Should the call to mpn_fib2_ui() obtain (up to) 2*mn limbs?
+*/
+
+int
+mpn_fib2m (mp_ptr fp, mp_ptr f1p, mp_srcptr np, mp_size_t nn, mp_srcptr mp, mp_size_t mn)
+{
+  unsigned long nfirst;
+  mp_limb_t nh;
+  mp_bitcnt_t nbi;
+  mp_size_t sn, fn;
+  int fcnt, ncnt;
+
+  ASSERT (! MPN_OVERLAP_P (fp, MAX(2*mn+1,5), f1p, MAX(2*mn+1,5)));
+  ASSERT (nn > 0 && np[nn - 1] != 0);
+
+  /* Estimate the maximal n such that fibonacci(n) fits in mn limbs. */
+#if GMP_NUMB_BITS % 16 == 0
+  if (UNLIKELY (ULONG_MAX / (23 * (GMP_NUMB_BITS / 16)) <= mn))
+    nfirst = ULONG_MAX;
+  else
+    nfirst = mn * (23 * (GMP_NUMB_BITS / 16));  /* 23/16 approximates 1/log2(golden ratio) from below */
+#else
+  {
+    mp_bitcnt_t	mbi;
+    mbi = (mp_bitcnt_t) mn * GMP_NUMB_BITS;
+
+    if (UNLIKELY (ULONG_MAX / 23 < mbi))
+      {
+        if (UNLIKELY (ULONG_MAX / 23 * 16 <= mbi))
+          nfirst = ULONG_MAX;
+        else
+          nfirst = mbi / 16 * 23;
+      }
+    else
+      nfirst = mbi * 23 / 16;
+  }
+#endif
+
+  sn = nn - 1;
+  nh = np[sn];
+  count_leading_zeros (ncnt, nh);
+  count_leading_zeros (fcnt, nfirst);
+
+  if (fcnt >= ncnt)
+    {
+      ncnt = fcnt - ncnt;
+      nh >>= ncnt;
+    }
+  else if (sn > 0)
+    {
+      ncnt -= fcnt;
+      nh <<= ncnt;
+      ncnt = GMP_NUMB_BITS - ncnt;
+      --sn;
+      nh |= np[sn] >> ncnt;
+    }
+  else
+    ncnt = 0;
+
+  nbi = sn * GMP_NUMB_BITS + ncnt;      /* number of low bits of n still to be processed */
+  if (nh > nfirst)
+    {
+      nh >>= 1;
+      ++nbi;
+    }
+
+  ASSERT (nh <= nfirst);
+  /* Take a starting pair from mpn_fib2_ui. */
+  fn = mpn_fib2_ui (fp, f1p, nh);
+  MPN_ZERO (fp + fn, mn - fn);
+  MPN_ZERO (f1p + fn, mn - fn);
+
+  if (nbi == 0)
+    {
+      if (fn == mn)
+        {
+          mp_limb_t qp[2];
+          mpn_tdiv_qr (qp, fp, 0, fp, fn, mp, mn);
+          mpn_tdiv_qr (qp, f1p, 0, f1p, fn, mp, mn);
+        }
+
+      return 0;
+    }
+  else
+    {
+      mp_ptr tp;
+      unsigned pb = nh & 1;
+      int neg;
+      TMP_DECL;
+
+      TMP_MARK;
+
+      tp = TMP_ALLOC_LIMBS (2 * mn + (mn < 2));
+
+      do
+        {
+          mp_ptr rp;
+          /* Here fp==F[k] and f1p==F[k-1], with k being the bits of n from
+             nbi upwards.
+
+             Based on the next bit of n, we'll double to the pair
+             fp==F[2k],f1p==F[2k-1] or fp==F[2k+1],f1p==F[2k], according as
+             that bit is 0 or 1 respectively.  */
+
+          mpn_sqr (tp, fp, mn);
+          mpn_sqr (fp, f1p, mn);
+
+          /* Calculate F[2k-1] = F[k]^2 + F[k-1]^2. */
+          f1p[2 * mn] = mpn_add_n (f1p, tp, fp, 2 * mn);
+
+          /* Calculate F[2k+1] = 4*F[k]^2 - F[k-1]^2 + 2*(-1)^k.
+             pb is the low bit of our implied k.  */
+
+          /* fp is F[k-1]^2 == 0 or 1 mod 4, like all squares. */
+          ASSERT ((fp[0] & 2) == 0);
+          ASSERT (pb == (pb & 1));
+          ASSERT ((fp[0] + (pb ? 2 : 0)) == (fp[0] | (pb << 1)));
+          fp[0] |= pb << 1;		/* possible -2 */
+#if HAVE_NATIVE_mpn_rsblsh2_n
+          fp[2 * mn] = 1 + mpn_rsblsh2_n (fp, fp, tp, 2 * mn);
+          MPN_INCR_U(fp, 2 * mn + 1, (1 ^ pb) << 1);	/* possible +2 */
+          fp[2 * mn] = (fp[2 * mn] - 1) & GMP_NUMB_MAX;
+#else
+          {
+            mp_limb_t  c;
+
+            c = mpn_lshift (tp, tp, 2 * mn, 2);
+            tp[0] |= (1 ^ pb) << 1;	/* possible +2 */
+            c -= mpn_sub_n (fp, tp, fp, 2 * mn);
+            fp[2 * mn] = c & GMP_NUMB_MAX;
+          }
+#endif
+          neg = fp[2 * mn] == GMP_NUMB_MAX;     /* top limb all ones: the subtraction went negative */
+
+          /* Here f1p holds F[2k-1] = F[k]^2 + F[k-1]^2 */
+          /* and fp holds F[2k+1] = 4*F[k]^2 - F[k-1]^2 + 2*(-1)^k, flagged by neg when negative */
+
+          /* Calculate F[2k] = F[2k+1] - F[2k-1], replacing the unwanted one of
+             F[2k+1] and F[2k-1].  */
+          --nbi;
+          pb = (np [nbi / GMP_NUMB_BITS] >> (nbi % GMP_NUMB_BITS)) & 1;
+          rp = pb ? f1p : fp;
+          if (neg)
+            {
+              /* Calculate -(F[2k+1] - F[2k-1]) */
+              rp[2 * mn] = f1p[2 * mn] + 1 - mpn_sub_n (rp, f1p, fp, 2 * mn);
+              neg = ! pb;
+              if (pb) /* fp not overwritten, negate it. */
+                fp [2 * mn] = 1 ^ mpn_neg (fp, fp, 2 * mn);
+            }
+          else
+            {
+              neg = abs_sub_n (rp, fp, f1p, 2 * mn + 1) < 0;
+            }
+
+          mpn_tdiv_qr (tp, fp, 0, fp, 2 * mn + 1, mp, mn);
+          mpn_tdiv_qr (tp, f1p, 0, f1p, 2 * mn + 1, mp, mn);
+        }
+      while (nbi != 0);
+
+      TMP_FREE;
+
+      return neg;
+    }
+}
diff --git a/gcc/gmp/mpn/generic/gcd_11.c b/gcc/gmp/mpn/generic/gcd_11.c
new file mode 100644
index 0000000..214e45c 100644
--- /dev/null
+++ b/gcc/gmp/mpn/generic/gcd_11.c
@@ -1,0 +1,74 @@
+/* mpn_gcd_11 -- limb greatest common divisor.
+
+Copyright 1994, 1996, 2000, 2001, 2009, 2012, 2019 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +mpn_gcd_11 (mp_limb_t u, mp_limb_t v) +{ + ASSERT (u & v & 1); + + /* In this loop, we represent the odd numbers u and v + without the redundant least significant one bit. This reduction + in size by one bit ensures that the high bit of t, below, is set + if and only if v > u. */ + + u >>= 1; + v >>= 1; + + while (u != v) + { + mp_limb_t t; + mp_limb_t vgtu; + int c; + + t = u - v; + vgtu = LIMB_HIGHBIT_TO_MASK (t); + + /* v <-- min (u, v) */ + v += (vgtu & t); + + /* u <-- |u - v| */ + u = (t ^ vgtu) - vgtu; + + count_trailing_zeros (c, t); + /* We have c <= GMP_LIMB_BITS - 2 here, so that + + u >>= (c + 1); + + would be safe. But unlike the addition c + 1, a separate + shift by 1 is independent of c, and can be executed in + parallel with count_trailing_zeros. */ + u = (u >> 1) >> c; + } + return (u << 1) + 1; +} diff --git a/gcc/gmp/mpn/generic/gcd_22.c b/gcc/gmp/mpn/generic/gcd_22.c new file mode 100644 index 0000000..d97f096 100644 --- /dev/null +++ b/gcc/gmp/mpn/generic/gcd_22.c @@ -1,0 +1,131 @@ +/* mpn_gcd_22 -- double limb greatest common divisor. + +Copyright 1994, 1996, 2000, 2001, 2009, 2012, 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library.
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#if GMP_NAIL_BITS > 0 +#error Nails not supported. +#endif + +mp_double_limb_t +mpn_gcd_22 (mp_limb_t u1, mp_limb_t u0, mp_limb_t v1, mp_limb_t v0) +{ + mp_double_limb_t g; + ASSERT (u0 & v0 & 1); + + /* Implicit least significant bit */ + u0 = (u0 >> 1) | (u1 << (GMP_LIMB_BITS - 1)); + u1 >>= 1; + + v0 = (v0 >> 1) | (v1 << (GMP_LIMB_BITS - 1)); + v1 >>= 1; + + while (u1 || v1) /* u1 == 0 can happen at most twice per call */ + { + mp_limb_t vgtu, t1, t0; + sub_ddmmss (t1, t0, u1, u0, v1, v0); + vgtu = LIMB_HIGHBIT_TO_MASK(t1); + + if (UNLIKELY (t0 == 0)) + { + if (t1 == 0) + { + g.d1 = (u1 << 1) | (u0 >> (GMP_LIMB_BITS - 1)); + g.d0 = (u0 << 1) | 1; + return g; + } + int c; + count_trailing_zeros (c, t1); + + /* v1 = min (u1, v1) */ + v1 += (vgtu & t1); + /* u0 = |u1 - v1| */ + u0 = (t1 ^ vgtu) - vgtu; + ASSERT (c < GMP_LIMB_BITS - 1); + u0 >>= c + 1; + u1 = 0; + } + else + { + int c; + count_trailing_zeros (c, t0); + c++; + /* V <-- min (U, V). + + Assembly version should use cmov.
Another alternative, + avoiding carry propagation, would be + + v0 += vgtu & t0; v1 += vgtu & (u1 - v1); + */ + add_ssaaaa (v1, v0, v1, v0, vgtu & t1, vgtu & t0); + /* U <-- |U - V| + No carry handling needed in this conditional negation, + since t0 != 0. */ + u0 = (t0 ^ vgtu) - vgtu; + u1 = t1 ^ vgtu; + if (UNLIKELY (c == GMP_LIMB_BITS)) + { + u0 = u1; + u1 = 0; + } + else + { + u0 = (u0 >> c) | (u1 << (GMP_LIMB_BITS - c)); + u1 >>= c; + } + } + } + while ((v0 | u0) & GMP_LIMB_HIGHBIT) + { /* At most two iterations */ + mp_limb_t vgtu, t0; + int c; + sub_ddmmss (vgtu, t0, 0, u0, 0, v0); + if (UNLIKELY (t0 == 0)) + { + g.d1 = u0 >> (GMP_LIMB_BITS - 1); + g.d0 = (u0 << 1) | 1; + return g; + } + + /* v <-- min (u, v) */ + v0 += (vgtu & t0); + + /* u <-- |u - v| */ + u0 = (t0 ^ vgtu) - vgtu; + + count_trailing_zeros (c, t0); + u0 = (u0 >> 1) >> c; + } + + g.d0 = mpn_gcd_11 ((u0 << 1) + 1, (v0 << 1) + 1); + g.d1 = 0; + return g; +} diff --git a/gcc/gmp/mpn/generic/sbpi1_bdiv_r.c b/gcc/gmp/mpn/generic/sbpi1_bdiv_r.c new file mode 100644 index 0000000..a609951 100644 --- /dev/null +++ b/gcc/gmp/mpn/generic/sbpi1_bdiv_r.c @@ -1,0 +1,79 @@ +/* mpn_sbpi1_bdiv_r -- schoolbook Hensel division with precomputed inverse, + returning remainder. + + Contributed to the GNU project by Niels Möller and Torbjörn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. + IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS + ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2006, 2009, 2011, 2012, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version.
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + + +/* Computes a binary quotient of size qn = un - dn. + Output: + + Q = -U * D^{-1} mod B^qn, + + R = (U + Q * D) * B^(-qn) + + Stores the dn least significant limbs of R at {up + un - dn, dn}, + and returns the carry from the addition U + Q*D. + + D must be odd. dinv is (-D)^-1 mod B. */ + +mp_limb_t +mpn_sbpi1_bdiv_r (mp_ptr up, mp_size_t un, + mp_srcptr dp, mp_size_t dn, mp_limb_t dinv) +{ + mp_size_t i; + mp_limb_t cy; + + ASSERT (dn > 0); + ASSERT (un > dn); + ASSERT ((dp[0] & 1) != 0); + ASSERT (-(dp[0] * dinv) == 1); + + for (i = un - dn, cy = 0; i != 0; i--) + { + mp_limb_t q = dinv * up[0]; + mp_limb_t hi = mpn_addmul_1 (up, dp, dn, q); + + hi += cy; + cy = hi < cy; + hi += up[dn]; + cy += hi < up[dn]; + up[dn] = hi; + up++; + } + + return cy; +} diff --git a/gcc/gmp/mpn/generic/strongfibo.c b/gcc/gmp/mpn/generic/strongfibo.c new file mode 100644 index 0000000..ffd038a 100644 --- /dev/null +++ b/gcc/gmp/mpn/generic/strongfibo.c @@ -1,0 +1,216 @@ +/* mpn_fib2m -- calculate Fibonacci numbers, modulo m. + +Contributed to the GNU project by Marco Bodrato. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. + +Copyright 2001, 2002, 2005, 2009, 2018 Free Software Foundation, Inc.
+ +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include +#include "gmp-impl.h" + +/* Stores |{ap,n}-{bp,n}| in {rp,n}, + returns the sign of {ap,n}-{bp,n}. */ +static int +abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n) +{ + mp_limb_t x, y; + while (--n >= 0) + { + x = ap[n]; + y = bp[n]; + if (x != y) + { + ++n; + if (x > y) + { + ASSERT_NOCARRY (mpn_sub_n (rp, ap, bp, n)); + return 1; + } + else + { + ASSERT_NOCARRY (mpn_sub_n (rp, bp, ap, n)); + return -1; + } + } + rp[n] = 0; + } + return 0; +} + +/* Computes at most count terms of the sequence needed by the + Lucas-Lehmer-Riesel test, indexing backward: + L_i = L_{i+1}^2 - 2 + + The sequence is computed modulo M = {mp, mn}. + The starting point is given in L_{count+1} = {lp, mn}. + The scratch pointed by sp, needs a space of at least 3 * mn + 1 limbs. + + Returns the index i>0 if L_i = 0 (mod M) is found within the + computed count terms of the sequence. Otherwise it returns zero. 
+ + Note: (+/-2)^2-2=2, (+/-1)^2-2=-1, 0^2-2=-2 + */ + +static mp_bitcnt_t +mpn_llriter (mp_ptr lp, mp_srcptr mp, mp_size_t mn, mp_bitcnt_t count, mp_ptr sp) +{ + do + { + mpn_sqr (sp, lp, mn); + mpn_tdiv_qr (sp + 2 * mn, lp, 0, sp, 2 * mn, mp, mn); + if (lp[0] < 5) + { + /* If L^2 % M < 5, |L^2 % M - 2| <= 2 */ + if (mn == 1 || mpn_zero_p (lp + 1, mn - 1)) + return (lp[0] == 2) ? count : 0; + else + MPN_DECR_U (lp, mn, 2); + } + else + lp[0] -= 2; + } while (--count != 0); + return 0; +} + +/* Store the Lucas' number L[n] at lp (maybe), computed modulo m. lp + and scratch should have room for mn*2+1 limbs. + + Returns the size of L[n] normally. + + If F[n] is zero modulo m, or L[n] is, returns 0 and lp is + undefined. +*/ + +static mp_size_t +mpn_lucm (mp_ptr lp, mp_srcptr np, mp_size_t nn, mp_srcptr mp, mp_size_t mn, mp_ptr scratch) +{ + int neg; + mp_limb_t cy; + + ASSERT (! MPN_OVERLAP_P (lp, MAX(2*mn+1,5), scratch, MAX(2*mn+1,5))); + ASSERT (nn > 0); + + neg = mpn_fib2m (lp, scratch, np, nn, mp, mn); + + /* F[n] = +/-{lp, mn}, F[n-1] = +/-{scratch, mn} */ + if (mpn_zero_p (lp, mn)) + return 0; + + if (neg) /* One sign is opposite, use sub instead of add. 
*/ + { +#if HAVE_NATIVE_mpn_rsblsh1_n || HAVE_NATIVE_mpn_sublsh1_n +#if HAVE_NATIVE_mpn_rsblsh1_n + cy = mpn_rsblsh1_n (lp, lp, scratch, mn); /* L[n] = +/-(2F[n-1]-(-F[n])) */ +#else + cy = mpn_sublsh1_n (lp, lp, scratch, mn); /* L[n] = -/+(F[n]-(-2F[n-1])) */ + if (cy != 0) + cy = mpn_add_n (lp, lp, mp, mn) - cy; +#endif + if (cy > 1) + cy += mpn_add_n (lp, lp, mp, mn); +#else + cy = mpn_lshift (scratch, scratch, mn, 1); /* 2F[n-1] */ + if (UNLIKELY (cy)) + cy -= mpn_sub_n (lp, scratch, lp, mn); /* L[n] = +/-(2F[n-1]-(-F[n])) */ + else + abs_sub_n (lp, lp, scratch, mn); +#endif + ASSERT (cy <= 1); + } + else + { +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (lp, lp, scratch, mn); /* L[n] = +/-(2F[n-1]+F[n])) */ +#else + cy = mpn_lshift (scratch, scratch, mn, 1); + cy+= mpn_add_n (lp, lp, scratch, mn); +#endif + ASSERT (cy <= 2); + } + while (cy || mpn_cmp (lp, mp, mn) >= 0) + cy -= mpn_sub_n (lp, lp, mp, mn); + MPN_NORMALIZE (lp, mn); + return mn; +} + +int +mpn_strongfibo (mp_srcptr mp, mp_size_t mn, mp_ptr scratch) +{ + mp_ptr lp, sp; + mp_size_t en; + mp_bitcnt_t b0; + TMP_DECL; + +#if GMP_NUMB_BITS % 4 == 0 + b0 = mpn_scan0 (mp, 0); +#else + { + mpz_t m = MPZ_ROINIT_N(mp, mn); + b0 = mpz_scan0 (m, 0); + } + if (UNLIKELY (b0 == mn * GMP_NUMB_BITS)) + { + en = 1; + scratch [0] = 1; + } + else +#endif + { + int cnt = b0 % GMP_NUMB_BITS; + en = b0 / GMP_NUMB_BITS; + if (LIKELY (cnt != 0)) + mpn_rshift (scratch, mp + en, mn - en, cnt); + else + MPN_COPY (scratch, mp + en, mn - en); + en = mn - en; + scratch [0] |= 1; + en -= scratch [en - 1] == 0; + } + TMP_MARK; + + lp = TMP_ALLOC_LIMBS (4 * mn + 6); + sp = lp + 2 * mn + 3; + en = mpn_lucm (sp, scratch, en, mp, mn, lp); + if (en != 0 && LIKELY (--b0 != 0)) + { + mpn_sqr (lp, sp, en); + lp [0] |= 2; /* V^2 + 2 */ + if (LIKELY (2 * en >= mn)) + mpn_tdiv_qr (sp, lp, 0, lp, 2 * en, mp, mn); + else + MPN_ZERO (lp + 2 * en, mn - 2 * en); + if (! 
mpn_zero_p (lp, mn) && LIKELY (--b0 != 0)) + b0 = mpn_llriter (lp, mp, mn, b0, lp + mn + 1); + } + TMP_FREE; + return (b0 != 0); +} diff --git a/gcc/gmp/mpn/ia64/gcd_11.asm b/gcc/gmp/mpn/ia64/gcd_11.asm new file mode 100644 index 0000000..6137227 100644 --- /dev/null +++ b/gcc/gmp/mpn/ia64/gcd_11.asm @@ -1,0 +1,110 @@ +dnl Itanium-2 mpn_gcd_11 + +dnl Copyright 2002-2005, 2012, 2013, 2015, 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bitpair (1x1 gcd) +C Itanium: ? +C Itanium 2: 4.5 + + +ASM_START() + +C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. 
+ +deflit(MAXSHIFT, 7) +deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1)) + + .rodata + ALIGN(m4_lshift(1,MAXSHIFT)) C align table to allow using dep +ctz_table: + data1 MAXSHIFT +forloop(i,1,MASK, +` data1 m4_count_trailing_zeros(i)-1 +') + +define(`x0', r32) +define(`y0', r33) + +PROLOGUE(mpn_gcd_11) + .prologue + .body + addl r22 = @ltoff(ctz_table), r1 + ;; + ld8 r22 = [r22] + br L(ent) + ;; + + ALIGN(32) +L(top): + .pred.rel "mutex", p6,p7 + {.mmi; (p7) mov y0 = x0 + (p6) sub x0 = x0, y0 + dep r21 = r19, r22, 0, MAXSHIFT C concat(table,lowbits) +}{.mmi; and r20 = MASK, r19 + (p7) mov x0 = r19 + and r23 = 6, r19 + ;; +}{.mmi; cmp.eq p6,p0 = 4, r23 + cmp.eq p7,p0 = 0, r23 + shr.u x0 = x0, 1 C shift-by-1, always OK +}{.mmb; ld1 r16 = [r21] + cmp.eq p10,p0 = 0, r20 + (p10) br.spnt.few.clr L(count_better) + ;; +} +L(bck): + .pred.rel "mutex", p6,p7 + {.mii; nop 0 + (p6) shr.u x0 = x0, 1 C u was ...100 before shift-by-1 above + (p7) shr.u x0 = x0, r16 C u was ...000 before shift-by-1 above + ;; +} +L(ent): + {.mmi; sub r19 = y0, x0 + cmp.gtu p6,p7 = x0, y0 + cmp.ne p8,p0 = x0, y0 +}{.mmb; nop 0 + nop 0 + (p8) br.sptk.few.clr L(top) +} + +L(end): mov r8 = y0 + br.ret.sptk.many b0 + +L(count_better): + add r20 = -1, x0 + ;; + andcm r23 = r20, x0 + ;; + popcnt r16 = r23 + br L(bck) +EPILOGUE() diff --git a/gcc/gmp/mpn/sparc64/gcd_11.asm b/gcc/gmp/mpn/sparc64/gcd_11.asm new file mode 100644 index 0000000..5564751 100644 --- /dev/null +++ b/gcc/gmp/mpn/sparc64/gcd_11.asm @@ -1,0 +1,88 @@ +dnl SPARC64 mpn_gcd_11. + +dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for SPARC by Torbjörn +dnl Granlund. + +dnl Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit (approx) +C UltraSPARC 1&2: 5.1 +C UltraSPARC 3: 5.0 +C UltraSPARC T1: 11.4 +C UltraSPARC T3: 10 +C UltraSPARC T4: 6 +C Numbers measured with: speed -CD -s32-64 -t32 mpn_gcd_1 + +C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. 
+ +deflit(MAXSHIFT, 7) +deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1)) + + RODATA + TYPE(ctz_table,object) +ctz_table: + .byte MAXSHIFT +forloop(i,1,MASK, +` .byte m4_count_trailing_zeros(i) +') + SIZE(ctz_table,.-ctz_table) + +define(`u0', `%o0') +define(`v0', `%o1') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_gcd_11) + LEA64(ctz_table, o5, g4) + b L(odd) + mov u0, %o4 + + ALIGN(16) +L(top): movcc %xcc, %o4, v0 C v = min(u,v) + movcc %xcc, %o2, %o0 C u = |v - u| +L(mid): ldub [%o5+%g3], %g5 C + brz,a,pn %g3, L(shift_alot) C + srlx %o0, MAXSHIFT, %o0 + srlx %o0, %g5, %o4 C new u, odd +L(odd): subcc v0, %o4, %o2 C v - u, set flags for branch and movcc + sub %o4, v0, %o0 C u - v + bnz,pt %xcc, L(top) C + and %o2, MASK, %g3 C extract low MAXSHIFT bits from (v-u) + + retl + mov v0, %o0 + +L(shift_alot): + b L(mid) + and %o0, MASK, %g3 C +EPILOGUE() diff --git a/gcc/gmp/mpn/x86/gcd_11.asm b/gcc/gmp/mpn/x86/gcd_11.asm new file mode 100644 index 0000000..af69135 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86/gcd_11.asm @@ -1,0 +1,126 @@ +dnl x86 mpn_gcd_11 optimised for processors with slow BSF. + +dnl Based on C version. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +dnl Rudimentary code for x86-32, i.e. for CPUs without cmov. Also, the bsf +dnl instruction is assumed to be so slow it is useless. Instead a table is +dnl used. +dnl +dnl The loop benefits from OoO, in-order CPUs might want a different loop. +dnl The ebx and ecx registers could be combined if the assignment of ecx were +dnl postponed until ebx died, but that would at least hurt in-order CPUs. + +C cycles/bit (approx) +C AMD K7 ? +C AMD K8,K9 ? +C AMD K10 ? +C AMD bd1 ? +C AMD bd2 ? +C AMD bd3 ? +C AMD bd4 ? +C AMD bt1 ? +C AMD bt2 ? +C AMD zn1 ? +C AMD zn2 ? +C Intel P4-2 ? +C Intel P4-3/4 ? +C Intel P6/13 ? +C Intel CNR ? +C Intel NHM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom ? +C Intel SLM ? +C Intel GLM ? +C Intel GLM+ ? +C VIA nano ?
+C Numbers measured with: speed -CD -s8-32 -t24 mpn_gcd_1 + +deflit(MAXSHIFT, 6) +deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1)) + +DEF_OBJECT(ctz_table,64) + .byte MAXSHIFT +forloop(i,1,MASK, +` .byte m4_count_trailing_zeros(i) +') +END_OBJECT(ctz_table) + +define(`u0', `%eax') +define(`v0', `%edx') + + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_gcd_11) + push %edi + push %esi + push %ebx + + mov 16(%esp), u0 + mov 20(%esp), v0 + LEAL( ctz_table, %esi) + sub v0, u0 C u = u - v 0 + jz L(end) + + ALIGN(16) +L(top): sbb %ebx, %ebx C mask 1 + mov u0, %edi C 1 + mov u0, %ecx C 1 + and %ebx, %edi C 2 + xor %ebx, u0 C 2 + add %edi, v0 C v = min(u.v) 3 + sub %ebx, u0 C u = |u - v| 3 +L(mid): and $MASK, %ecx C 2 + movzbl (%esi,%ecx), %ecx C 3 + jz L(shift_alot) + shr %cl, u0 C 4 + sub v0, u0 C u = u - v 0,5 + jnz L(top) + +L(end): mov v0, %eax + pop %ebx + pop %esi + pop %edi + ret + +L(shift_alot): + shr $MAXSHIFT, u0 + mov u0, %ecx + jmp L(mid) +EPILOGUE() +ASM_END() diff --git a/gcc/gmp/mpn/x86_64/gcd_11.asm b/gcc/gmp/mpn/x86_64/gcd_11.asm new file mode 100644 index 0000000..f9b3bcc 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/gcd_11.asm @@ -1,0 +1,114 @@ +dnl AMD64 mpn_gcd_11 -- 1 x 1 gcd. + +dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn +dnl Granlund. + +dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit +C AMD K8,K9 5.5 +C AMD K10 ? +C AMD bd1 ? +C AMD bd2 ? +C AMD bd3 ? +C AMD bd4 ? +C AMD bt1 7.1 +C AMD bt2 ? +C AMD zn1 ? +C AMD zn2 ? +C Intel P4 ? +C Intel CNR ? +C Intel PNR ? +C Intel NHM ? +C Intel WSM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom 9.1 +C Intel SLM 6.9 +C Intel GLM 6.0 +C Intel GLM+ 5.8 +C VIA nano ? + + +C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. + +deflit(MAXSHIFT, 7) +deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1)) + +DEF_OBJECT(ctz_table,64) + .byte MAXSHIFT +forloop(i,1,MASK, +` .byte m4_count_trailing_zeros(i) +') +END_OBJECT(ctz_table) + +define(`u0', `%rdi') +define(`v0', `%rsi') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_11) + FUNC_ENTRY(2) + LEA( ctz_table, %r8) + jmp L(ent) + + ALIGN(16) +L(top): cmovc %rdx, u0 C u = |u - v| + cmovc %rax, v0 C v = min(u,v) +L(mid): and $MASK, R32(%rdx) + movzbl (%r8,%rdx), R32(%rcx) + jz L(shift_alot) + shr R8(%rcx), u0 +L(ent): mov u0, %rax + mov v0, %rdx + sub u0, %rdx + sub v0, u0 + jnz L(top) + +L(end): C rax = result + C rdx = 0 for the benefit of internal gcd_22 call + FUNC_EXIT() + ret + +L(shift_alot): + shr $MAXSHIFT, u0 + mov u0, %rdx + jmp L(mid) +EPILOGUE() diff --git a/gcc/gmp/mpn/x86_64/gcd_22.asm b/gcc/gmp/mpn/x86_64/gcd_22.asm new file mode 100644 index 0000000..78f985f 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/gcd_22.asm @@ -1,0 +1,163 @@ +dnl AMD64 
mpn_gcd_22. Assumes useless bsf, useless shrd, no tzcnt, no shlx. +dnl We actually use tzcnt here, when table cannot count bits, as tzcnt always +dnl works for our use, and helps a lot for certain CPUs. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit +C AMD K8,K9 8.9 +C AMD K10 8.8 +C AMD bd1 9.7 +C AMD bd2 7.8 +C AMD bd3 ? +C AMD bd4 7.4 +C AMD bt1 9.2 +C AMD bt2 9.1 +C AMD zn1 7.5 +C AMD zn2 7.5 +C Intel P4 ? +C Intel CNR 10.5 +C Intel PNR 10.5 +C Intel NHM 9.7 +C Intel WSM 9.7 +C Intel SBR 10.7 +C Intel IBR ? +C Intel HWL 9.5 +C Intel BWL 8.7 +C Intel SKL 8.6 +C Intel atom 18.9 +C Intel SLM 14.0 +C Intel GLM 9.8 +C Intel GLM+ 8.8 +C VIA nano ? + + +C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. 
+ +deflit(MAXSHIFT, 8) +deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1)) + +DEF_OBJECT(ctz_table,64) + .byte MAXSHIFT +forloop(i,1,MASK, +` .byte m4_count_trailing_zeros(i) +') +END_OBJECT(ctz_table) + +define(`u1', `%rdi') +define(`u0', `%rsi') +define(`v1', `%rdx') +define(`v0_param', `%rcx') + +define(`v0', `%rax') +define(`cnt', `%rcx') + +define(`s0', `%r8') +define(`s1', `%r9') +define(`t0', `%rcx') +define(`t1', `%r11') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_22) + FUNC_ENTRY(4) + mov v0_param, v0 + + LEA( ctz_table, %r10) + + ALIGN(16) +L(top): mov v0, t0 + sub u0, t0 + jz L(lowz) C jump when low limb result = 0 + mov v1, t1 + sbb u1, t1 + + mov u0, s0 + mov u1, s1 + + sub v0, u0 + sbb v1, u1 + +L(bck): cmovc t0, u0 C u = |u - v| + cmovc t1, u1 C u = |u - v| + cmovc s0, v0 C v = min(u,v) + cmovc s1, v1 C v = min(u,v) + + and $MASK, R32(t0) + movzbl (%r10,t0), R32(cnt) + jz L(count_better) +C Rightshift (u1,,u0) into (u1,,u0) +L(shr): shr R8(cnt), u0 + mov u1, t1 + shr R8(cnt), u1 + neg cnt + shl R8(cnt), t1 + or t1, u0 + + test v1, v1 + jnz L(top) + test u1, u1 + jnz L(top) + +L(gcd_11): + mov v0, %rdi +C mov u0, %rsi + TCALL( mpn_gcd_11) + +L(count_better): + rep;bsf u0, cnt C tzcnt! + jmp L(shr) + +L(lowz):C We come here when v0 - u0 = 0 + C 1. If v1 - u1 = 0, then gcd is u = v. + C 2. Else compute gcd_21({v1,v0}, |u1-v1|) + mov v1, t0 + sub u1, t0 + je L(end) + + xor t1, t1 + mov u0, s0 + mov u1, s1 + mov u1, u0 + xor u1, u1 + sub v1, u0 + jmp L(bck) + +L(end): C mov v0, %rax + C mov v1, %rdx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gcc/gmp/tests/cxx/t-ops2.h b/gcc/gmp/tests/cxx/t-ops2.h new file mode 100644 index 0000000..f8898ee 100644 --- /dev/null +++ b/gcc/gmp/tests/cxx/t-ops2.h @@ -1,0 +1,82 @@ +/* Test mp*_class operators and functions. + +Copyright 2011, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library test suite. 
+ +The GNU MP Library test suite is free software; you can redistribute it +and/or modify it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 3 of the License, +or (at your option) any later version. + +The GNU MP Library test suite is distributed in the hope that it will be +useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. + +You should have received a copy of the GNU General Public License along with +the GNU MP Library test suite. If not, see https://www.gnu.org/licenses/. */ + +#include "config.h" + +#include + +#include "gmpxx.h" +#include "gmp-impl.h" +#include "tests.h" + + +#define CHECK1(Type,a,fun) \ + ASSERT_ALWAYS(fun((Type)(a))==fun(a)) +#define CHECK(Type1,Type2,a,b,op) \ + ASSERT_ALWAYS(((Type1)(a) op (Type2)(b))==((a) op (b))) +#define CHECK_G(Type,a,b,op) \ + CHECK(Type,Type,a,b,op) +#define CHECK_UI(Type,a,b,op) \ + CHECK(Type,unsigned long,a,b,op); \ + CHECK(unsigned long,Type,a,b,op) +#define CHECK_SI(Type,a,b,op) \ + CHECK(Type,long,a,b,op); \ + CHECK(long,Type,a,b,op) +#define CHECK_D(Type,a,b,op) \ + CHECK(Type,double,a,b,op); \ + CHECK(double,Type,a,b,op) +#define CHECK_MPZ(Type,a,b,op) \ + CHECK(Type,mpz_class,a,b,op); \ + CHECK(mpz_class,Type,a,b,op) +#define CHECK_MPQ(Type,a,b,op) \ + CHECK(Type,mpq_class,a,b,op); \ + CHECK(mpq_class,Type,a,b,op) +#define CHECK_ALL_SIGNED(Type,a,b,op) \ + CHECK_G(Type,a,b,op); \ + CHECK_SI(Type,a,b,op); \ + CHECK_D(Type,a,b,op) +#define CHECK_ALL_SIGNS(Type,a,b,op) \ + CHECK_ALL_SIGNED(Type,a,b,op); \ + CHECK_ALL_SIGNED(Type,-(a),b,op); \ + CHECK_ALL_SIGNED(Type,a,-(b),op); \ + CHECK_ALL_SIGNED(Type,-(a),-(b),op) +#define CHECK_ALL(Type,a,b,op) \ + CHECK_ALL_SIGNED(Type,a,b,op); \ + CHECK_UI(Type,a,b,op) +#define CHECK_ALL_SIGNED_COMPARISONS(Type,a,b) \ + CHECK_ALL_SIGNED(Type,a,b,<); \ + 
CHECK_ALL_SIGNED(Type,a,b,>); \ + CHECK_ALL_SIGNED(Type,a,b,<=); \ + CHECK_ALL_SIGNED(Type,a,b,>=); \ + CHECK_ALL_SIGNED(Type,a,b,==); \ + CHECK_ALL_SIGNED(Type,a,b,!=) +#define CHECK_ALL_SIGNS_COMPARISONS(Type,a,b) \ + CHECK_ALL_SIGNS(Type,a,b,<); \ + CHECK_ALL_SIGNS(Type,a,b,>); \ + CHECK_ALL_SIGNS(Type,a,b,<=); \ + CHECK_ALL_SIGNS(Type,a,b,>=); \ + CHECK_ALL_SIGNS(Type,a,b,==); \ + CHECK_ALL_SIGNS(Type,a,b,!=) +#define CHECK_ALL_COMPARISONS(Type,a,b) \ + CHECK_ALL(Type,a,b,<); \ + CHECK_ALL(Type,a,b,>); \ + CHECK_ALL(Type,a,b,<=); \ + CHECK_ALL(Type,a,b,>=); \ + CHECK_ALL(Type,a,b,==); \ + CHECK_ALL(Type,a,b,!=) diff --git a/gcc/gmp/tests/cxx/t-ops2f.cc b/gcc/gmp/tests/cxx/t-ops2f.cc new file mode 100644 index 0000000..71c9e10 100644 --- /dev/null +++ b/gcc/gmp/tests/cxx/t-ops2f.cc @@ -1,0 +1,87 @@ +/* Test mp*_class operators and functions. + +Copyright 2011, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library test suite. + +The GNU MP Library test suite is free software; you can redistribute it +and/or modify it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 3 of the License, +or (at your option) any later version. + +The GNU MP Library test suite is distributed in the hope that it will be +useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. + +You should have received a copy of the GNU General Public License along with +the GNU MP Library test suite. If not, see https://www.gnu.org/licenses/. 
*/ + +#include "t-ops2.h" + +void checkf (){ + ASSERT_ALWAYS(sqrt(mpf_class(7))>2.64); + ASSERT_ALWAYS(sqrt(mpf_class(7))<2.65); + ASSERT_ALWAYS(sqrt(mpf_class(0))==0); + // TODO: add some consistency checks, as described in + // https://gmplib.org/list-archives/gmp-bugs/2013-February/002940.html + CHECK1(mpf_class,1.9,trunc); + CHECK1(mpf_class,1.9,floor); + CHECK1(mpf_class,1.9,ceil); + CHECK1(mpf_class,4.3,trunc); + CHECK1(mpf_class,4.3,floor); + CHECK1(mpf_class,4.3,ceil); + CHECK1(mpf_class,-7.1,trunc); + CHECK1(mpf_class,-7.1,floor); + CHECK1(mpf_class,-7.1,ceil); + CHECK1(mpf_class,-2.8,trunc); + CHECK1(mpf_class,-2.8,floor); + CHECK1(mpf_class,-2.8,ceil); + CHECK1(mpf_class,-1.5,trunc); + CHECK1(mpf_class,-1.5,floor); + CHECK1(mpf_class,-1.5,ceil); + CHECK1(mpf_class,2.5,trunc); + CHECK1(mpf_class,2.5,floor); + CHECK1(mpf_class,2.5,ceil); + ASSERT_ALWAYS(hypot(mpf_class(-3),mpf_class(4))>4.9); + ASSERT_ALWAYS(hypot(mpf_class(-3),mpf_class(4))<5.1); + ASSERT_ALWAYS(hypot(mpf_class(-3),4.)>4.9); + ASSERT_ALWAYS(hypot(-3.,mpf_class(4))<5.1); + ASSERT_ALWAYS(hypot(mpf_class(-3),4l)>4.9); + ASSERT_ALWAYS(hypot(-3l,mpf_class(4))<5.1); + ASSERT_ALWAYS(hypot(mpf_class(-3),4ul)>4.9); + ASSERT_ALWAYS(hypot(3ul,mpf_class(4))<5.1); + CHECK(mpf_class,mpq_class,1.5,2.25,+); + CHECK(mpf_class,mpq_class,1.5,2.25,-); + CHECK(mpf_class,mpq_class,1.5,-2.25,*); + CHECK(mpf_class,mpq_class,1.5,-2,/); + CHECK_MPQ(mpf_class,-5.5,-2.25,+); + CHECK_MPQ(mpf_class,-5.5,-2.25,-); + CHECK_MPQ(mpf_class,-5.5,-2.25,*); + CHECK_MPQ(mpf_class,-5.25,-0.5,/); + CHECK_MPQ(mpf_class,5,-2,<); + CHECK_MPQ(mpf_class,5,-2,>); + CHECK_MPQ(mpf_class,5,-2,<=); + CHECK_MPQ(mpf_class,5,-2,>=); + CHECK_MPQ(mpf_class,5,-2,==); + CHECK_MPQ(mpf_class,5,-2,!=); + CHECK_MPQ(mpf_class,0,0,<); + CHECK_MPQ(mpf_class,0,0,>); + CHECK_MPQ(mpf_class,0,0,<=); + CHECK_MPQ(mpf_class,0,0,>=); + CHECK_MPQ(mpf_class,0,0,==); + CHECK_MPQ(mpf_class,0,0,!=); +} + +int +main (void) +{ + tests_start(); + + // Enough precision 
for 1 + denorm_min + mpf_set_default_prec(DBL_MANT_DIG-DBL_MIN_EXP+42); + checkf(); + + tests_end(); + return 0; +} diff --git a/gcc/gmp/tests/cxx/t-ops2qf.cc b/gcc/gmp/tests/cxx/t-ops2qf.cc new file mode 100644 index 0000000..bd96f61 100644 --- /dev/null +++ b/gcc/gmp/tests/cxx/t-ops2qf.cc @@ -1,0 +1,89 @@ +/* Test mp*_class operators and functions. + +Copyright 2011, 2012, 2018 Free Software Foundation, Inc. + +This file is part of the GNU MP Library test suite. + +The GNU MP Library test suite is free software; you can redistribute it +and/or modify it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 3 of the License, +or (at your option) any later version. + +The GNU MP Library test suite is distributed in the hope that it will be +useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. + +You should have received a copy of the GNU General Public License along with +the GNU MP Library test suite. If not, see https://www.gnu.org/licenses/. 
*/ + +#include "t-ops2.h" + +template +void checkqf (){ + CHECK_ALL(T,5.,0,+); + CHECK_ALL(T,5.,0,-); + CHECK_ALL(T,5.,2,+); CHECK_MPZ(T,5.,2,+); + CHECK_ALL(T,5.,2,-); CHECK_MPZ(T,5.,2,-); + CHECK_ALL(T,5.,2,*); CHECK_MPZ(T,5.,2,*); + CHECK_ALL(T,5.,2,/); CHECK_MPZ(T,5.,2,/); + CHECK_ALL(T,0.,2,/); + CHECK_ALL_SIGNS(T,11.,3,+); + CHECK_ALL_SIGNS(T,11.,3,-); + CHECK_ALL_SIGNS(T,13.,1,+); + CHECK_ALL_SIGNS(T,13.,1,-); + CHECK_ALL_SIGNS(T,11.,3,*); + CHECK_ALL_SIGNS(T,11.,4,/); + CHECK_SI(T,LONG_MIN,1,*); + CHECK_SI(T,0,3,*); + CHECK_ALL_COMPARISONS(T,5.,2); + CHECK_ALL_SIGNS_COMPARISONS(T,11.,3); + CHECK_MPZ(T,5,-2,<); + CHECK_MPZ(T,5,-2,>); + CHECK_MPZ(T,5,-2,<=); + CHECK_MPZ(T,5,-2,>=); + CHECK_MPZ(T,5,-2,==); + CHECK_MPZ(T,5,-2,!=); + CHECK_MPZ(T,0,0,<); + CHECK_MPZ(T,0,0,>); + CHECK_MPZ(T,0,0,<=); + CHECK_MPZ(T,0,0,>=); + CHECK_MPZ(T,0,0,==); + CHECK_MPZ(T,0,0,!=); + ASSERT_ALWAYS(T(6)<<2==6.*4); + ASSERT_ALWAYS(T(6)>>2==6./4); + ASSERT_ALWAYS(T(-13)<<2==-13.*4); + ASSERT_ALWAYS(T(-13)>>2==-13./4); + ASSERT_ALWAYS(++T(7)==8); + ASSERT_ALWAYS(++T(-8)==-7); + ASSERT_ALWAYS(--T(8)==7); + ASSERT_ALWAYS(--T(-7)==-8); + ASSERT_ALWAYS(+T(7)==7); + ASSERT_ALWAYS(+T(-8)==-8); + ASSERT_ALWAYS(-T(7)==-7); + ASSERT_ALWAYS(-T(-8)==8); + ASSERT_ALWAYS(abs(T(7))==7); + ASSERT_ALWAYS(abs(T(-8))==8); + ASSERT_ALWAYS(sgn(T(0))==0); + ASSERT_ALWAYS(sgn(T(9))==1); + ASSERT_ALWAYS(sgn(T(-17))==-1); + ASSERT_ALWAYS(T(1)+DBL_MAX>2); + ASSERT_ALWAYS(T(1)+DBL_MIN>1); + ASSERT_ALWAYS(T(1)+DBL_MIN<1.001); + ASSERT_ALWAYS(T(1)+std::numeric_limits::denorm_min()>1); + ASSERT_ALWAYS(T(1)+std::numeric_limits::denorm_min()<1.001); +} + +int +main (void) +{ + tests_start(); + + // Enough precision for 1 + denorm_min + mpf_set_default_prec(DBL_MANT_DIG-DBL_MIN_EXP+42); + checkqf(); + checkqf(); + + tests_end(); + return 0; +} diff --git a/gcc/gmp/tests/cxx/t-ops2z.cc b/gcc/gmp/tests/cxx/t-ops2z.cc new file mode 100644 index 0000000..6d0e4ad 100644 --- /dev/null +++ b/gcc/gmp/tests/cxx/t-ops2z.cc 
@@ -1,0 +1,126 @@ +/* Test mp*_class operators and functions. + +Copyright 2011, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library test suite. + +The GNU MP Library test suite is free software; you can redistribute it +and/or modify it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 3 of the License, +or (at your option) any later version. + +The GNU MP Library test suite is distributed in the hope that it will be +useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. + +You should have received a copy of the GNU General Public License along with +the GNU MP Library test suite. If not, see https://www.gnu.org/licenses/. */ + +#include "t-ops2.h" + +void checkz (){ + CHECK_ALL(mpz_class,5,2,+); + CHECK_ALL(mpz_class,5,2,-); + CHECK_ALL(mpz_class,5,2,*); + CHECK_ALL(mpz_class,5,2,/); + CHECK_ALL(mpz_class,5,2,%); + CHECK_ALL_COMPARISONS(mpz_class,5,2); + CHECK_ALL_SIGNS(mpz_class,11,3,+); + CHECK_ALL_SIGNS(mpz_class,11,3,-); + CHECK_ALL_SIGNS(mpz_class,11,3,*); + CHECK_ALL_SIGNS(mpz_class,11,3,/); + CHECK_ALL_SIGNS(mpz_class,11,3,%); + CHECK_ALL_SIGNS(mpz_class,17,2,*); + CHECK_ALL_SIGNS(mpz_class,17,2,/); + CHECK_ALL_SIGNS(mpz_class,17,2,%); + CHECK(unsigned long,mpz_class,5,-2,/); + CHECK(unsigned long,mpz_class,5,-2,%); + ASSERT_ALWAYS(7ul/mpz_class(1e35)==0); + ASSERT_ALWAYS(7ul%mpz_class(1e35)==7); + ASSERT_ALWAYS(7ul/mpz_class(-1e35)==0); + ASSERT_ALWAYS(7ul%mpz_class(-1e35)==7); + CHECK_ALL_SIGNS_COMPARISONS(mpz_class,11,3); + CHECK_ALL(mpz_class,6,3,&); + CHECK_ALL(mpz_class,6,3,|); + CHECK_ALL(mpz_class,6,3,^); + CHECK(mpz_class,unsigned long,6,2,<<); + CHECK(mpz_class,unsigned long,6,2,>>); + ASSERT_ALWAYS((mpz_class(-13)<<(unsigned long)2) == (-13)*4); + CHECK(mpz_class,unsigned long,-13,2,>>); + ASSERT_ALWAYS(++mpz_class(7)==8); + 
ASSERT_ALWAYS(++mpz_class(-8)==-7); + ASSERT_ALWAYS(--mpz_class(8)==7); + ASSERT_ALWAYS(--mpz_class(-7)==-8); + ASSERT_ALWAYS(~mpz_class(7)==-8); + ASSERT_ALWAYS(~mpz_class(-8)==7); + ASSERT_ALWAYS(+mpz_class(7)==7); + ASSERT_ALWAYS(+mpz_class(-8)==-8); + ASSERT_ALWAYS(-mpz_class(7)==-7); + ASSERT_ALWAYS(-mpz_class(-8)==8); + ASSERT_ALWAYS(abs(mpz_class(7))==7); + ASSERT_ALWAYS(abs(mpz_class(-8))==8); + ASSERT_ALWAYS(sqrt(mpz_class(7))==2); + ASSERT_ALWAYS(sqrt(mpz_class(0))==0); + ASSERT_ALWAYS(sgn(mpz_class(0))==0); + ASSERT_ALWAYS(sgn(mpz_class(9))==1); + ASSERT_ALWAYS(sgn(mpz_class(-17))==-1); + ASSERT_ALWAYS(mpz_class(1)+DBL_MAX>2); + ASSERT_ALWAYS(mpz_class(1)+DBL_MIN<2); + ASSERT_ALWAYS(mpz_class(1)+std::numeric_limits::denorm_min()<2); + ASSERT_ALWAYS(gcd(mpz_class(6),mpz_class(8))==2); + ASSERT_ALWAYS(gcd(-mpz_class(6),mpz_class(8))==2); + ASSERT_ALWAYS(gcd(-mpz_class(6),-mpz_class(8))==2); + ASSERT_ALWAYS(gcd(mpz_class(6),8.f)==2); + ASSERT_ALWAYS(gcd(-mpz_class(6),static_cast(8))==2); + ASSERT_ALWAYS(gcd(static_cast(-6),mpz_class(5)+3)==2); + ASSERT_ALWAYS(lcm(mpz_class(6),mpz_class(8))==24); + ASSERT_ALWAYS(lcm(-mpz_class(6),mpz_class(8))==24); + ASSERT_ALWAYS(lcm(-mpz_class(6),-mpz_class(8))==24); + ASSERT_ALWAYS(lcm(mpz_class(6),static_cast(8))==24); + ASSERT_ALWAYS(lcm(-mpz_class(6),static_cast(8))==24); + ASSERT_ALWAYS(lcm(-6.,mpz_class(5)+3)==24); + ASSERT_ALWAYS(factorial(mpz_class(3))==6); + ASSERT_ALWAYS(factorial(mpz_class(5)-1)==24); + ASSERT_ALWAYS(mpz_class::factorial(mpz_class(3))==6); + ASSERT_ALWAYS(mpz_class::factorial(mpz_class(2)*2)==24); + ASSERT_ALWAYS(mpz_class::factorial(3)==6); + ASSERT_ALWAYS(mpz_class::factorial(3ul)==6); + ASSERT_ALWAYS(mpz_class::factorial(3.f)==6); + mpz_class ret; + try { ret=factorial(-mpz_class(3)); ASSERT_ALWAYS(0); } + catch (std::domain_error) {} + try { ret=mpz_class::factorial(-2); ASSERT_ALWAYS(0); } + catch (std::domain_error) {} + try { ret=factorial(mpz_class(1)<<300); ASSERT_ALWAYS(0); } + catch 
(std::bad_alloc) {} + ASSERT_ALWAYS(mpz_class::primorial(mpz_class(3))==6); + ASSERT_ALWAYS(mpz_class::primorial(mpz_class(2)*2)==6); + ASSERT_ALWAYS(mpz_class::primorial(3)==6); + ASSERT_ALWAYS(mpz_class::primorial(3ul)==6); + ASSERT_ALWAYS(mpz_class::primorial(3.f)==6); + try { ret=primorial(-mpz_class(3)); ASSERT_ALWAYS(0); } + catch (std::domain_error) {} + try { ret=mpz_class::primorial(-5); ASSERT_ALWAYS(0); } + catch (std::domain_error) {} + try { ret=primorial(mpz_class(1)<<300); ASSERT_ALWAYS(0); } + catch (std::bad_alloc) {} + ASSERT_ALWAYS(mpz_class::fibonacci(mpz_class(6))==8); + ASSERT_ALWAYS(mpz_class::fibonacci(mpz_class(2)*2)==3); + ASSERT_ALWAYS(mpz_class::fibonacci(3)==2); + ASSERT_ALWAYS(mpz_class::fibonacci(3ul)==2); + ASSERT_ALWAYS(mpz_class::fibonacci(3.f)==2); + ASSERT_ALWAYS(fibonacci(-mpz_class(6))==-8); + ASSERT_ALWAYS(mpz_class::fibonacci(-3)==2); + try { ret=fibonacci(mpz_class(1)<<300); ASSERT_ALWAYS(0); } + catch (std::bad_alloc) {} +} + +int +main (void) +{ + tests_start(); + checkz(); + tests_end(); + return 0; +} diff --git a/gcc/gmp/tests/devel/addmul_N.c b/gcc/gmp/tests/devel/addmul_N.c new file mode 100644 index 0000000..410e291 100644 --- /dev/null +++ b/gcc/gmp/tests/devel/addmul_N.c @@ -1,0 +1,272 @@ +/* +Copyright 1996-2002, 2004, 2007 Free Software Foundation, Inc. + +This file is part of the GNU MP Library test suite. + +The GNU MP Library test suite is free software; you can redistribute it +and/or modify it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 3 of the License, +or (at your option) any later version. + +The GNU MP Library test suite is distributed in the hope that it will be +useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +the GNU MP Library test suite. If not, see https://www.gnu.org/licenses/. */ + +#include +#include +#include +#include "gmp-impl.h" +#include "longlong.h" + +#if defined (USG) || defined (__SVR4) || defined (_UNICOS) || defined (__hpux) +#include + +int +cputime () +{ + if (CLOCKS_PER_SEC < 100000) + return clock () * 1000 / CLOCKS_PER_SEC; + return clock () / (CLOCKS_PER_SEC / 1000); +} +#else +#include +#include +#include + +int +cputime () +{ + struct rusage rus; + + getrusage (0, &rus); + return rus.ru_utime.tv_sec * 1000 + rus.ru_utime.tv_usec / 1000; +} +#endif + +#ifndef NOCHECK +static void print_posneg (mp_limb_t); +#endif +#ifdef PRINT +static void mpn_print (mp_ptr, mp_size_t); +#endif + +#define LXW ((int) (2 * sizeof (mp_limb_t))) +#define M * 1000000 + +#ifndef CLOCK +#error "Don't know CLOCK of your machine" +#endif + +#ifndef OPS +#define OPS (CLOCK/5) +#endif +#ifndef SIZE +#define SIZE 496 +#endif +#ifndef TIMES +#define TIMES OPS/(SIZE+1) +#endif + +#if N == 2 +#define mpn_addmul_N mpn_addmul_2 +#elif N == 3 +#define mpn_addmul_N mpn_addmul_3 +#elif N == 4 +#define mpn_addmul_N mpn_addmul_4 +#elif N == 5 +#define mpn_addmul_N mpn_addmul_5 +#elif N == 6 +#define mpn_addmul_N mpn_addmul_6 +#elif N == 7 +#define mpn_addmul_N mpn_addmul_7 +#elif N == 8 +#define mpn_addmul_N mpn_addmul_8 +#endif + +mp_limb_t +refmpn_addmul_N (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp) +{ + int i; + for (i = 1; i < N; i++) + { + rp[n] = mpn_addmul_1 (rp, up, n, *vp); + rp++; + vp++; + } + return mpn_addmul_1 (rp, up, n, *vp); +} + +int +main (int argc, char **argv) +{ + mp_limb_t up[SIZE]; + mp_limb_t ref[SIZE + N - 1]; + mp_limb_t mem[SIZE + N + 1]; + mp_ptr rp = mem + 1; + mp_limb_t vp[N]; + mp_limb_t cy_ref, cy_try; + int i; +#if TIMES != 1 + long t0, t; + double cyc; +#endif + unsigned test; + mp_size_t size; + unsigned ntests; + + ntests = ~(unsigned) 0; + if (argc == 2) + 
ntests = strtol (argv[1], 0, 0); + + for (test = 1; test <= ntests; test++) + { +#if TIMES == 1 && ! defined (PRINT) + if (test % (CLOCK / SIZE / 1000) == 0) + { + printf ("\r%u", test); + fflush (stdout); + } +#endif + +#ifdef RANDOM + size = random () % (SIZE - N + 1) + N; +#else + size = SIZE; +#endif + + rp[size + N - 1] = 0x12345678; + rp[-1] = 0x87654321; + + mpn_random (vp, N); + +#if TIMES != 1 /* run timing tests unless asked not to */ + mpn_random (up, size); + mpn_random (rp, size + N - 1); + + MPN_COPY (ref, rp, size + N - 1); + t0 = cputime(); + for (i = 0; i < TIMES; i++) + mpn_addmul_N (ref, up, size, vp); + t = cputime() - t0; + cyc = ((double) t * CLOCK) / (TIMES * size * 1000.0) / N; + printf ("mpn_addmul_%d: %5ldms (%.3f cycles/limb) [%.2f Gb/s]\n", + N, t, cyc, CLOCK/cyc*GMP_LIMB_BITS*GMP_LIMB_BITS/1e9); +#endif + +#ifdef PLAIN_RANDOM +#define MPN_RANDOM mpn_random +#else +#define MPN_RANDOM mpn_random2 +#endif + +#ifdef ZEROu + MPN_ZERO (up, size); +#else + MPN_RANDOM (up, size); +#endif + MPN_RANDOM (vp, N); +#ifdef ZERO + MPN_ZERO (rp, size + N - 1); +#else + MPN_RANDOM (rp, size + N - 1); +#endif + +#if defined (PRINT) || defined (PRINTV) + printf ("vp="); + mpn_print (vp, N); +#endif +#ifdef PRINT + printf ("%*s ", 3 + N * LXW, ""); + mpn_print (rp, size); + printf ("%*s ", 3 + N * LXW, ""); + mpn_print (up, size); +#endif + + MPN_COPY (ref, rp, size + N - 1); + cy_ref = refmpn_addmul_N (ref, up, size, vp); + cy_try = mpn_addmul_N (rp, up, size, vp); + +#ifdef PRINT + printf ("%*lX ", LXW, cy_ref); + mpn_print (ref, size + N - 1); + printf ("%*lX ", LXW, cy_try); + mpn_print (rp, size + N - 1); +#endif + +#ifndef NOCHECK + if (cy_ref != cy_try || mpn_cmp (ref, rp, size + N - 1) != 0 + || rp[size + N - 1] != 0x12345678 || rp[-1] != 0x87654321) + { + printf ("\n ref%*s try%*s diff\n", LXW - 3, "", 2 * LXW - 6, ""); + for (i = 0; i < size + N - 1; i++) + { + printf ("%6d: ", i); + printf ("%0*llX ", LXW, (unsigned long long) ref[i]); + printf 
("%0*llX ", LXW, (unsigned long long) rp[i]); + print_posneg (rp[i] - ref[i]); + printf ("\n"); + } + printf ("retval: "); + printf ("%0*llX ", LXW, (unsigned long long) cy_ref); + printf ("%0*llX ", LXW, (unsigned long long) cy_try); + print_posneg (cy_try - cy_ref); + printf ("\n"); + if (rp[-1] != 0x87654321) + printf ("clobbered at low end\n"); + if (rp[size + N - 1] != 0x12345678) + printf ("clobbered at high end\n"); + printf ("TEST NUMBER %u\n", test); + abort(); + } +#endif + } + exit (0); +} + +#ifndef NOCHECK +static void +print_posneg (mp_limb_t d) +{ + char buf[LXW + 2]; + if (d == 0) + printf (" %*X", LXW, 0); + else if (-d < d) + { + sprintf (buf, "%llX", (unsigned long long) -d); + printf ("%*s-%s", LXW - (int) strlen (buf), "", buf); + } + else + { + sprintf (buf, "%llX", (unsigned long long) d); + printf ("%*s+%s", LXW - (int) strlen (buf), "", buf); + } +} +#endif + +#ifdef PRINT +static void +mpn_print (mp_ptr p, mp_size_t size) +{ + mp_size_t i; + + for (i = size - 1; i >= 0; i--) + { +#ifdef _LONG_LONG_LIMB + printf ("%0*lX%0*lX", (int) (sizeof(mp_limb_t)), + (unsigned long) (p[i] >> (GMP_LIMB_BITS/2)), + (int) (sizeof(mp_limb_t)), (unsigned long) (p[i])); +#else + printf ("%0*lX", LXW, p[i]); +#endif +#ifdef SPACE + if (i != 0) + printf (" "); +#endif + } + puts (""); +} +#endif diff --git a/gcc/gmp/tests/devel/cnd_aors_n.c b/gcc/gmp/tests/devel/cnd_aors_n.c new file mode 100644 index 0000000..00d6db1 100644 --- /dev/null +++ b/gcc/gmp/tests/devel/cnd_aors_n.c @@ -1,0 +1,257 @@ +/* +Copyright 1996-2004, 2009, 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library test suite. + +The GNU MP Library test suite is free software; you can redistribute it +and/or modify it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 3 of the License, +or (at your option) any later version. 
+ +The GNU MP Library test suite is distributed in the hope that it will be +useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. + +You should have received a copy of the GNU General Public License along with +the GNU MP Library test suite. If not, see https://www.gnu.org/licenses/. */ + +#include +#include +#include +#include "gmp-impl.h" +#include "tests/tests.h" + +#ifdef OPERATION_cnd_add_n +#define func __gmpn_cnd_add_n +#define reffunc refmpn_cnd_add_n +#define funcname "mpn_cnd_add_n" +#endif + +#ifdef OPERATION_cnd_sub_n +#define func __gmpn_cnd_sub_n +#define reffunc refmpn_cnd_sub_n +#define funcname "mpn_cnd_sub_n" +#endif + +#if defined (USG) || defined (__SVR4) || defined (_UNICOS) || defined (__hpux) +#include + +int +cputime () +{ + if (CLOCKS_PER_SEC < 100000) + return clock () * 1000 / CLOCKS_PER_SEC; + return clock () / (CLOCKS_PER_SEC / 1000); +} +#else +#include +#include +#include + +int +cputime () +{ + struct rusage rus; + + getrusage (0, &rus); + return rus.ru_utime.tv_sec * 1000 + rus.ru_utime.tv_usec / 1000; +} +#endif + +static void print_posneg (mp_limb_t); +static void mpn_print (mp_ptr, mp_size_t); + +#define LXW ((int) (2 * sizeof (mp_limb_t))) +#define M * 1000000 + +#ifndef CLOCK +#error "Don't know CLOCK of your machine" +#endif + +#ifndef OPS +#define OPS (CLOCK/5) +#endif +#ifndef SIZE +#define SIZE 328 +#endif +#ifndef TIMES +#define TIMES OPS/(SIZE+1) +#endif + +int +main (int argc, char **argv) +{ + mp_ptr s1, s2, dx, dy; + mp_limb_t cyx, cyy; + int i; +#if TIMES != 1 + long t0, t; +#endif + unsigned int test; + mp_size_t size; + unsigned int ntests; + + s1 = malloc ((SIZE + 2) * sizeof (mp_limb_t)); + s2 = malloc ((SIZE + 2) * sizeof (mp_limb_t)); + dx = malloc ((SIZE + 2) * sizeof (mp_limb_t)); + dy = malloc ((SIZE + 2) * sizeof (mp_limb_t)); + + s1 += 1; + s2 += 1; + + ntests = ~(unsigned) 0; + if 
(argc == 2) + ntests = strtol (argv[1], 0, 0); + + for (test = 1; test <= ntests; test++) + { +#if TIMES == 1 && ! defined (PRINT) + if (test % (SIZE > 100000 ? 1 : 100000 / SIZE) == 0) + { + printf ("\r%u", test); + fflush (stdout); + } +#endif + +#ifdef PLAIN_RANDOM +#define MPN_RANDOM mpn_random +#else +#define MPN_RANDOM mpn_random2 +#endif + +#ifdef RANDOM + size = random () % SIZE + 1; +#else + size = SIZE; +#endif + + dx[0] = 0x87654321; + dy[0] = 0x87654321; + dx[size+1] = 0x12345678; + dy[size+1] = 0x12345678; + +#if TIMES != 1 + mpn_random (s1 - 1, size + 2); + mpn_random (s2 - 1, size + 2); + + t0 = cputime(); + for (i = 0; i < TIMES; i++) + func (i & 1, dx+1, s1, s2, size); + t = cputime() - t0; + printf (funcname ": %5ldms (%.3f cycles/limb)\n", + t, ((double) t * CLOCK) / (TIMES * size * 1000.0)); +#endif + +#ifndef NOCHECK +#ifndef ZEROup + MPN_RANDOM (s1 - 1, size + 2); +#else + MPN_ZERO (s1, size); +#endif +#ifndef ZEROvp + MPN_RANDOM (s2 - 1, size + 2); +#else + MPN_ZERO (s2, size); +#endif + +#ifdef PRINT + mpn_print (s1, size); + mpn_print (s2, size); +#endif + + /* Put garbage in the destination. 
*/ + for (i = 0; i < size; i++) + { + dx[i+1] = 0xdead; + dy[i+1] = 0xbeef; + } + + int cond = random() & 1; + + cyx = reffunc (cond, dx+1, s1, s2, size); + cyy = func (cond, dy+1, s1, s2, size); + +#ifdef PRINT + mpn_print (&cyx, 1); + mpn_print (dx+1, size); + mpn_print (&cyy, 1); + mpn_print (dy+1, size); +#endif + + if (cyx != cyy || mpn_cmp (dx, dy, size+2) != 0 + || dx[0] != 0x87654321 || dx[size+1] != 0x12345678) + { + mp_size_t s, e; + for (s = 0;; s++) + if ((unsigned long long) (dx+1)[s] != (unsigned long long) (dy+1)[s]) + break; + for (e = size - 1;; e--) + if ((unsigned long long) (dx+1)[e] != (unsigned long long) (dy+1)[e]) + break; +#ifndef PRINT + for (i = s; i <= e; i++) + { + printf ("%6d: ", i); + printf ("%0*llX ", LXW, (unsigned long long) (dx+1)[i]); + printf ("%0*llX ", LXW, (unsigned long long) (dy+1)[i]); + print_posneg ((dy+1)[i] - (dx+1)[i]); + printf ("\n"); + } + printf ("%6s: ", "retval"); + printf ("%0*llX ", LXW, (unsigned long long) cyx); + printf ("%0*llX ", LXW, (unsigned long long) cyy); + print_posneg (cyx - cyy); +#endif + printf ("\n"); + if (dy[0] != 0x87654321) + printf ("clobbered at low end\n"); + if (dy[size+1] != 0x12345678) + printf ("clobbered at high end\n"); + printf ("TEST NUMBER %u\n", test); + abort(); + } +#endif + } + exit (0); +} + +static void +print_posneg (mp_limb_t d) +{ + char buf[LXW + 2]; + if (d == 0) + printf (" %*X", LXW, 0); + else if (-d < d) + { + sprintf (buf, "%llX", (unsigned long long) -d); + printf ("%*s-%s", LXW - (int) strlen (buf), "", buf); + } + else + { + sprintf (buf, "%llX", (unsigned long long) d); + printf ("%*s+%s", LXW - (int) strlen (buf), "", buf); + } +} + +static void +mpn_print (mp_ptr p, mp_size_t size) +{ + mp_size_t i; + + for (i = size - 1; i >= 0; i--) + { +#ifdef _LONG_LONG_LIMB + printf ("%0*lX%0*lX", (int) (sizeof(mp_limb_t)), + (unsigned long) (p[i] >> (GMP_LIMB_BITS/2)), + (int) (sizeof(mp_limb_t)), (unsigned long) (p[i])); +#else + printf ("%0*lX", (int) (2 * 
sizeof(mp_limb_t)), p[i]); +#endif +#ifdef SPACE + if (i != 0) + printf (" "); +#endif + } + puts (""); +} diff --git a/gcc/gmp/tests/devel/mul_N.c b/gcc/gmp/tests/devel/mul_N.c new file mode 100644 index 0000000..c9de5ec 100644 --- /dev/null +++ b/gcc/gmp/tests/devel/mul_N.c @@ -1,0 +1,270 @@ +/* +Copyright 1996-2002, 2004, 2007 Free Software Foundation, Inc. + +This file is part of the GNU MP Library test suite. + +The GNU MP Library test suite is free software; you can redistribute it +and/or modify it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 3 of the License, +or (at your option) any later version. + +The GNU MP Library test suite is distributed in the hope that it will be +useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. + +You should have received a copy of the GNU General Public License along with +the GNU MP Library test suite. If not, see https://www.gnu.org/licenses/. 
*/ + +#include +#include +#include +#include "gmp-impl.h" +#include "longlong.h" + +#if defined (USG) || defined (__SVR4) || defined (_UNICOS) || defined (__hpux) +#include + +int +cputime () +{ + if (CLOCKS_PER_SEC < 100000) + return clock () * 1000 / CLOCKS_PER_SEC; + return clock () / (CLOCKS_PER_SEC / 1000); +} +#else +#include +#include +#include + +int +cputime () +{ + struct rusage rus; + + getrusage (0, &rus); + return rus.ru_utime.tv_sec * 1000 + rus.ru_utime.tv_usec / 1000; +} +#endif + +#ifndef NOCHECK +static void print_posneg (mp_limb_t); +#endif +#ifdef PRINT +static void mpn_print (mp_ptr, mp_size_t); +#endif + +#define LXW ((int) (2 * sizeof (mp_limb_t))) +#define M * 1000000 + +#ifndef CLOCK +#error "Don't know CLOCK of your machine" +#endif + +#ifndef OPS +#define OPS (CLOCK/5) +#endif +#ifndef SIZE +#define SIZE 496 +#endif +#ifndef TIMES +#define TIMES OPS/(SIZE+1) +#endif + +#if N == 2 +#define mpn_mul_N mpn_mul_2 +#elif N == 3 +#define mpn_mul_N mpn_mul_3 +#elif N == 4 +#define mpn_mul_N mpn_mul_4 +#elif N == 5 +#define mpn_mul_N mpn_mul_5 +#elif N == 6 +#define mpn_mul_N mpn_mul_6 +#elif N == 7 +#define mpn_mul_N mpn_mul_7 +#elif N == 8 +#define mpn_mul_N mpn_mul_8 +#endif + +mp_limb_t +refmpn_mul_N (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp) +{ + int i; + rp[n] = mpn_mul_1 (rp, up, n, *vp); + rp++; + vp++; + for (i = 2; i < N; i++) + { + rp[n] = mpn_addmul_1 (rp, up, n, *vp); + rp++; + vp++; + } + return mpn_addmul_1 (rp, up, n, *vp); +} + +int +main (int argc, char **argv) +{ + mp_limb_t up[SIZE]; + mp_limb_t ref[SIZE + N - 1]; + mp_limb_t mem[SIZE + N + 1]; + mp_ptr rp = mem + 1; + mp_limb_t vp[N]; + mp_limb_t cy_ref, cy_try; + int i; +#if TIMES != 1 + long t0, t; + double cyc; +#endif + unsigned test; + mp_size_t size; + unsigned ntests; + + ntests = ~(unsigned) 0; + if (argc == 2) + ntests = strtol (argv[1], 0, 0); + + for (test = 1; test <= ntests; test++) + { +#if TIMES == 1 && ! 
defined (PRINT) + if (test % (CLOCK / SIZE / 1000) == 0) + { + printf ("\r%u", test); + fflush (stdout); + } +#endif + +#ifdef RANDOM + size = random () % (SIZE - N + 1) + N; +#else + size = SIZE; +#endif + + rp[size + N - 1] = 0x12345678; + rp[-1] = 0x87654321; + + mpn_random (vp, N); + +#if TIMES != 1 /* run timing tests unless asked not to */ + mpn_random (up, size); + + MPN_COPY (ref, rp, size + N - 1); + t0 = cputime(); + for (i = 0; i < TIMES; i++) + mpn_mul_N (ref, up, size, vp); + t = cputime() - t0; + cyc = ((double) t * CLOCK) / (TIMES * size * 1000.0) / N; + printf ("mpn_mul_%d: %5ldms (%.3f cycles/limb) [%.2f Gb/s]\n", + N, t, cyc, CLOCK/cyc*GMP_LIMB_BITS*GMP_LIMB_BITS/1e9); +#endif + +#ifdef PLAIN_RANDOM +#define MPN_RANDOM mpn_random +#else +#define MPN_RANDOM mpn_random2 +#endif + +#ifdef ZEROu + MPN_ZERO (up, size); +#else + MPN_RANDOM (up, size); +#endif + MPN_RANDOM (vp, N); + /* vp[0] = vp[1] = vp[2] = vp[3] = vp[4] = vp[5] = 0; */ + MPN_RANDOM (rp, size + N - 1); + +#if defined (PRINT) || defined (PRINTV) + printf ("vp="); + mpn_print (vp, N); +#endif +#ifdef PRINT + printf ("%*s ", 3 + N * LXW, ""); + mpn_print (up, size); +#endif + + MPN_COPY (ref, rp, size + N - 1); + cy_ref = refmpn_mul_N (ref, up, size, vp); + cy_try = mpn_mul_N (rp, up, size, vp); + +#ifdef PRINT + printf ("%*lX ", LXW, cy_ref); + mpn_print (ref, size + N - 1); + printf ("%*lX ", LXW, cy_try); + mpn_print (rp, size + N - 1); +#endif + +#ifndef NOCHECK + if (cy_ref != cy_try || mpn_cmp (ref, rp, size + N - 1) != 0 +// if (cy_ref != cy_try || mpn_cmp (ref + 5, rp + 5, size + N - 1 - 6) != 0 + || rp[size + N - 1] != 0x12345678 || rp[-1] != 0x87654321) + { + printf ("\n ref%*s try%*s diff\n", LXW - 3, "", 2 * LXW - 6, ""); + for (i = 0; i < size + N - 1; i++) + { + printf ("%6d: ", i); + printf ("%0*llX ", LXW, (unsigned long long) ref[i]); + printf ("%0*llX ", LXW, (unsigned long long) rp[i]); + print_posneg (rp[i] - ref[i]); + printf ("\n"); + } + printf ("retval: "); + 
printf ("%0*llX ", LXW, (unsigned long long) cy_ref); + printf ("%0*llX ", LXW, (unsigned long long) cy_try); + print_posneg (cy_try - cy_ref); + printf ("\n"); + if (rp[-1] != 0x87654321) + printf ("clobbered at low end\n"); + if (rp[size + N - 1] != 0x12345678) + printf ("clobbered at high end\n"); + printf ("TEST NUMBER %u\n", test); + abort(); + } +#endif + } + exit (0); +} + +#ifndef NOCHECK +static void +print_posneg (mp_limb_t d) +{ + char buf[LXW + 2]; + if (d == 0) + printf (" %*X", LXW, 0); + else if (-d < d) + { + sprintf (buf, "%llX", (unsigned long long) -d); + printf ("%*s-%s", LXW - (int) strlen (buf), "", buf); + } + else + { + sprintf (buf, "%llX", (unsigned long long) d); + printf ("%*s+%s", LXW - (int) strlen (buf), "", buf); + } +} +#endif + +#ifdef PRINT +static void +mpn_print (mp_ptr p, mp_size_t size) +{ + mp_size_t i; + + for (i = size - 1; i >= 0; i--) + { +#ifdef _LONG_LONG_LIMB + printf ("%0*lX%0*lX", (int) (sizeof(mp_limb_t)), + (unsigned long) (p[i] >> (GMP_LIMB_BITS/2)), + (int) (sizeof(mp_limb_t)), (unsigned long) (p[i])); +#else + printf ("%0*lX", LXW, p[i]); +#endif +#ifdef SPACE + if (i != 0) + printf (" "); +#endif + } + puts (""); +} +#endif diff --git a/gcc/gmp/tests/devel/primes.c b/gcc/gmp/tests/devel/primes.c new file mode 100644 index 0000000..84b3b51 100644 --- /dev/null +++ b/gcc/gmp/tests/devel/primes.c @@ -1,0 +1,341 @@ +/* +Copyright 2018-2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library test suite. + +The GNU MP Library test suite is free software; you can redistribute it +and/or modify it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 3 of the License, +or (at your option) any later version. + +The GNU MP Library test suite is distributed in the hope that it will be +useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General +Public License for more details. + +You should have received a copy of the GNU General Public License along with +the GNU MP Library test suite. If not, see https://www.gnu.org/licenses/. */ + +/* Usage: + + ./primes [p|c] [n0] + + Checks mpz_probab_prime_p(n, r) exhaustively, starting from n=n0 + up to nMax. + If n0 * n0 > nMax, the intervall is sieved piecewise, else the + full intervall [0..nMax] is sieved at once. + With the parameter "p" (or nothing), tests all numbers. With "c" + only composites are tested. + + ./primes n [n0] + + Checks mpz_nextprime() exhaustively, starting from n=n0 up to + nMax. + + WARNING: The full intervall [0..nMax] is sieved at once, even if + only a piece is needed. This may require a lot of memory! + + */ + +#include +#include +#include "gmp-impl.h" +#include "longlong.h" +#include "tests.h" +#define STOP(x) return (x) +/* #define STOP(x) x */ +#define REPS 10 +/* #define TRACE(x,n) if ((n)>1) {x;} */ +#define TRACE(x,n) + +/* The full primesieve.c is included, just for block_resieve, that + is not exported ... 
*/ +#undef gmp_primesieve +#include "../../primesieve.c" + +#ifndef BLOCK_SIZE +#define BLOCK_SIZE 2048 +#endif + +/*********************************************************/ +/* Section sieve: sieving functions and tools for primes */ +/*********************************************************/ + +static mp_size_t +primesieve_size (mp_limb_t n) { return n_to_bit(n) / GMP_LIMB_BITS + 1; } + +/*************************************************************/ +/* Section macros: common macros, for swing/fac/bin (&sieve) */ +/*************************************************************/ + +#define LOOP_ON_SIEVE_CONTINUE(prime,end,sieve) \ + __max_i = (end); \ + \ + do { \ + ++__i; \ + if (((sieve)[__index] & __mask) == 0) \ + { \ + mp_limb_t prime; \ + prime = id_to_n(__i) + +#define LOOP_ON_SIEVE_BEGIN(prime,start,end,off,sieve) \ + do { \ + mp_limb_t __mask, __index, __max_i, __i; \ + \ + __i = (start)-(off); \ + __index = __i / GMP_LIMB_BITS; \ + __mask = CNST_LIMB(1) << (__i % GMP_LIMB_BITS); \ + __i += (off); \ + \ + LOOP_ON_SIEVE_CONTINUE(prime,end,sieve) + +#define LOOP_ON_SIEVE_STOP \ + } \ + __mask = __mask << 1 | __mask >> (GMP_LIMB_BITS-1); \ + __index += __mask & 1; \ + } while (__i <= __max_i) + +#define LOOP_ON_SIEVE_END \ + LOOP_ON_SIEVE_STOP; \ + } while (0) + +mpz_t g; + +int something_wrong (mpz_t er, int exp) +{ + fprintf (stderr, "value = %lu , expected = %i\n", mpz_get_ui (er), exp); + return -1; +} + +int +check_pprime (unsigned long begin, unsigned long end, int composites) +{ + begin = (begin / 6U) * 6U; + for (;(begin < 2) & (begin <= end); ++begin) + { + *(g->_mp_d) = begin; + TRACE(printf ("-%li ", begin),1); + if (mpz_probab_prime_p (g, REPS)) + STOP (something_wrong (g, 0)); + } + for (;(begin < 4) & (begin <= end); ++begin) + { + *(g->_mp_d) = begin; + TRACE(printf ("+%li ", begin),2); + if (!composites && !mpz_probab_prime_p (g, REPS)) + STOP (something_wrong (g, 1)); + } + if (end > 4) { + if ((end > 10000) && (begin > end / begin)) + { + 
mp_limb_t *sieve, *primes; + mp_size_t size_s, size_p, off; + unsigned long start; + + mpz_set_ui (g, end); + mpz_sqrt (g, g); + start = mpz_get_ui (g) + GMP_LIMB_BITS; + size_p = primesieve_size (start); + + primes = __GMP_ALLOCATE_FUNC_LIMBS (size_p); + gmp_primesieve (primes, start); + + size_s = BLOCK_SIZE * 2; + sieve = __GMP_ALLOCATE_FUNC_LIMBS (size_s); + off = n_to_bit(begin) + (begin % 3 == 0); + + do { + TRACE (printf ("off =%li\n", off),3); + block_resieve (sieve, BLOCK_SIZE, off, primes); + TRACE (printf ("LOOP =%li - %li\n", id_to_n (off+1), id_to_n (off + BLOCK_SIZE * GMP_LIMB_BITS)),3); + LOOP_ON_SIEVE_BEGIN (prime, off, off + BLOCK_SIZE * GMP_LIMB_BITS - 1, + off, sieve); + + do { + *(g->_mp_d) = begin; + TRACE(printf ("-%li ", begin),1); + if (mpz_probab_prime_p (g, REPS)) + STOP (something_wrong (g, 0)); + if ((begin & 0xff) == 0) + { + spinner(); + if ((begin & 0xfffffff) == 0) + printf ("%li (0x%lx)\n", begin, begin); + } + } while (++begin < prime); + + *(g->_mp_d) = begin; + TRACE(printf ("+%li ", begin),2); + if (!composites && ! mpz_probab_prime_p (g, REPS)) + STOP (something_wrong (g, 1)); + ++begin; + + LOOP_ON_SIEVE_END; + off += BLOCK_SIZE * GMP_LIMB_BITS; + } while (begin < end); + + __GMP_FREE_FUNC_LIMBS (sieve, size_s); + __GMP_FREE_FUNC_LIMBS (primes, size_p); + } + else + { + mp_limb_t *sieve; + mp_size_t size; + unsigned long start; + + size = primesieve_size (end); + + sieve = __GMP_ALLOCATE_FUNC_LIMBS (size); + gmp_primesieve (sieve, end); + start = MAX (begin, 5) | 1; + LOOP_ON_SIEVE_BEGIN (prime, n_to_bit(start) + (start % 3 == 0), + n_to_bit (end), 0, sieve); + + do { + *(g->_mp_d) = begin; + TRACE(printf ("-%li ", begin),1); + if (mpz_probab_prime_p (g, REPS)) + STOP (something_wrong (g, 0)); + if ((begin & 0xff) == 0) + { + spinner(); + if ((begin & 0xfffffff) == 0) + printf ("%li (0x%lx)\n", begin, begin); + } + } while (++begin < prime); + + *(g->_mp_d) = begin; + TRACE(printf ("+%li ", begin),2); + if (!composites && ! 
mpz_probab_prime_p (g, REPS)) + STOP (something_wrong (g, 1)); + ++begin; + + LOOP_ON_SIEVE_END; + + __GMP_FREE_FUNC_LIMBS (sieve, size); + } + } + + for (;begin < end; ++begin) + { + *(g->_mp_d) = begin; + TRACE(printf ("-%li ", begin),1); + if (mpz_probab_prime_p (g, REPS)) + STOP (something_wrong (g, 0)); + } + + gmp_printf ("%Zd\n", g); + return 0; +} + +int +check_nprime (unsigned long begin, unsigned long end) +{ + if (begin < 2) + { + *(g->_mp_d) = begin; + TRACE(printf ("%li ", begin),1); + mpz_nextprime (g, g); + if (mpz_cmp_ui (g, 2) != 0) + STOP (something_wrong (g, -1)); + begin = mpz_get_ui (g); + } + if (begin < 3) + { + *(g->_mp_d) = begin; + TRACE(printf ("%li ", begin),1); + mpz_nextprime (g, g); + if (mpz_cmp_ui (g, 3) != 0) + STOP (something_wrong (g, -1)); + begin = mpz_get_ui (g); + } + if (end > 4) + { + mp_limb_t *sieve; + mp_size_t size; + unsigned long start; + + size = primesieve_size (end); + + sieve = __GMP_ALLOCATE_FUNC_LIMBS (size); + gmp_primesieve (sieve, end); + start = MAX (begin, 5) | 1; + *(g->_mp_d) = begin; + LOOP_ON_SIEVE_BEGIN (prime, n_to_bit(start) + (start % 3 == 0), + n_to_bit (end), 0, sieve); + + mpz_nextprime (g, g); + if (mpz_cmp_ui (g, prime) != 0) + STOP (something_wrong (g, -1)); + + if (prime - start > 200) + { + start = prime; + spinner(); + if (prime - begin > 0xfffffff) + { + begin = prime; + printf ("%li (0x%lx)\n", begin, begin); + } + } + + LOOP_ON_SIEVE_END; + + __GMP_FREE_FUNC_LIMBS (sieve, size); + } + + if (mpz_cmp_ui (g, end) < 0) + { + mpz_nextprime (g, g); + if (mpz_cmp_ui (g, end) <= 0) + STOP (something_wrong (g, -1)); + } + + gmp_printf ("%Zd\n", g); + return 0; +} + +int +main (int argc, char **argv) +{ + int ret, mode = 0; + unsigned long begin = 0, end = 0; + + for (;argc > 1;--argc,++argv) + switch (*argv[1]) { + case 'p': + mode = 0; + break; + case 'c': + mode = 2; + break; + case 'n': + mode = 1; + break; + default: + begin = end; + end = atol (argv[1]); + } + + if (begin >= end) + { + 
fprintf (stderr, "usage: primes [n|p|c] [n0] \n"); + exit (1); + } + + mpz_init_set_ui (g, ULONG_MAX); + + switch (mode) { + case 1: + ret = check_nprime (begin, end); + break; + default: + ret = check_pprime (begin, end, mode); + } + + mpz_clear (g); + + if (ret == 0) + printf ("Prime tests checked in [%lu - %lu] [0x%lx - 0x%lx].\n", begin, end, begin, end); + return ret; +} diff --git a/gcc/gmp/tests/devel/sqrtrem_1_2.c b/gcc/gmp/tests/devel/sqrtrem_1_2.c new file mode 100644 index 0000000..3951191 100644 --- /dev/null +++ b/gcc/gmp/tests/devel/sqrtrem_1_2.c @@ -1,0 +1,401 @@ +/* +Copyright 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library test suite. + +The GNU MP Library test suite is free software; you can redistribute it +and/or modify it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 3 of the License, +or (at your option) any later version. + +The GNU MP Library test suite is distributed in the hope that it will be +useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. + +You should have received a copy of the GNU General Public License along with +the GNU MP Library test suite. If not, see https://www.gnu.org/licenses/. */ + +/* Usage: + + ./sqrtrem_1_2 x + + Checks mpn_sqrtrem() exhaustively, starting from 0, incrementing + the operand by a single unit, until all values handled by + mpn_sqrtrem{1,2} are tested. SLOW. + + ./sqrtrem_1_2 s 1 + + Checks some special cases for mpn_sqrtrem(). I.e. 
values of the form + 2^k*i and 2^k*(i+1)-1, with k=2^n and 0 +#include +#include "gmp-impl.h" +#include "longlong.h" +#include "tests.h" +#define STOP(x) return (x) +/* #define STOP(x) x */ +#define SPINNER(v) \ + do { \ + MPN_SIZEINBASE_2EXP (spinner_count, q, v, 1); \ + --spinner_count; \ + spinner(); \ + } while (0) + +int something_wrong (mp_limb_t er, mp_limb_t ec, mp_limb_t es) +{ + fprintf (stderr, "root = %lu , rem = {%lu , %lu}\n", (long unsigned) es,(long unsigned) ec,(long unsigned) er); + return -1; +} + +int +check_all_values (int justone, int quick) +{ + mp_limb_t es, mer, er, s[1], r[2], q[2]; + mp_size_t x; + unsigned bits; + + es=1; + if (quick) { + printf ("Quick, skipping some... (%u)\n", GMP_NUMB_BITS - 2); + es <<= GMP_NUMB_BITS / 2 - 1; + } + er=0; + mer= es << 1; + *q = es * es; + printf ("All values tested, up to bits:\n"); + do { + x = mpn_sqrtrem (s, r, q, 1); + if (UNLIKELY (x != (er != 0)) || UNLIKELY (*s != es) + || UNLIKELY ((x == 1) && (er != *r))) + STOP (something_wrong (er, 0, es)); + + if (UNLIKELY (er == mer)) { + ++es; + if (UNLIKELY ((es & 0xff) == 0)) + SPINNER(1); + mer +=2; /* mer = es * 2 */ + er = 0; + } else + ++er; + ++*q; + } while (*q != 0); + q[1] = 1; + SPINNER(2); + printf ("\nValues of a single limb, tested.\n"); + if (justone) return 0; + printf ("All values tested, up to bits:\n"); + do { + x = mpn_sqrtrem (s, r, q, 2); + if (UNLIKELY (x != (er != 0)) || UNLIKELY (*s != es) + || UNLIKELY ((x == 1) && (er != *r))) + STOP (something_wrong (er, 0, es)); + + if (UNLIKELY (er == mer)) { + ++es; + if (UNLIKELY ((es & 0x7f) == 0)) + SPINNER(2); + mer +=2; /* mer = es * 2 */ + if (UNLIKELY (mer == 0)) + break; + er = 0; + } else + ++er; + q[1] += (++*q == 0); + } while (1); + SPINNER(2); + printf ("\nValues with at most a limb for reminder, tested.\n"); + printf ("Testing more values not supported, jet.\n"); + return 0; +} + +mp_limb_t +upd (mp_limb_t *s, mp_limb_t k) +{ + mp_limb_t _s = *s; + + while (k > _s * 2) + { + 
k -= _s * 2 + 1; + ++_s; + } + *s = _s; + return k; +} + +mp_limb_t +upd1 (mp_limb_t *s, mp_limb_t k) +{ + mp_limb_t _s = *s; + + if (LIKELY (k < _s * 2)) return k + 1; + *s = _s + 1; + return k - _s * 2; +} + +int +check_some_values (int justone, int quick) +{ + mp_limb_t es, her, er, k, s[1], r[2], q[2]; + mp_size_t x; + unsigned bits; + + es = 1 << 1; + if (quick) { + es <<= GMP_NUMB_BITS / 4 - 1; + printf ("Quick, skipping some... (%u)\n", GMP_NUMB_BITS / 2); + } + er = 0; + *q = es * es; + printf ("High-half values tested, up to bits:\n"); + do { + k = *q - 1; + do { + x = mpn_sqrtrem (s, r, q, 1); + if (UNLIKELY (x != (er != 0)) || UNLIKELY (*s != es) + || UNLIKELY ((x == 1) && (er != *r))) + STOP (something_wrong (er, 0, es)); + + if (UNLIKELY ((es & 0xffff) == 0)) + SPINNER(1); + if ((*q & k) == 0) { + *q |= k; + er = upd (&es, k + er); + } else { + ++*q; + er = upd1 (&es, er); + } + } while (es & k); + } while (*q != 0); + q[1] = 1; + SPINNER(2); + printf ("\nValues of a single limb, tested.\n"); + if (justone) return 0; + if (quick) { + es <<= GMP_NUMB_BITS / 2 - 1; + q[1] <<= GMP_NUMB_BITS - 2; + printf ("Quick, skipping some... 
(%u)\n", GMP_NUMB_BITS - 2); + } + printf ("High-half values tested, up to bits:\n"); + do { + x = mpn_sqrtrem (s, r, q, 2); + if (UNLIKELY (x != (er != 0)) || UNLIKELY (*s != es) + || UNLIKELY ((x == 1) && (er != *r))) + STOP (something_wrong (er, 0, es)); + + if (*q == 0) { + *q = GMP_NUMB_MAX; + if (UNLIKELY ((es & 0xffff) == 0)) { + if (UNLIKELY (es == GMP_NUMB_HIGHBIT)) + break; + SPINNER(2); + } + /* er = er + GMP_NUMB_MAX - 1 - es*2 // postponed */ + ++es; + /* er = er + GMP_NUMB_MAX - 1 - 2*(es-1) = + = er +(GMP_NUMB_MAX + 1)- 2* es = er - 2*es */ + er = upd (&es, er - 2 * es); + } else { + *q = 0; + ++q[1]; + er = upd1 (&es, er); + } + } while (1); + SPINNER(2); + printf ("\nValues with at most a limb for reminder, tested.\n"); + er = GMP_NUMB_MAX; her = 0; + + printf ("High-half values tested, up to bits:\n"); + do { + x = mpn_sqrtrem (s, r, q, 2); + if (UNLIKELY (x != (her?2:(er != 0))) || UNLIKELY (*s != es) + || UNLIKELY ((x != 0) && ((er != *r) || ((x == 2) && (r[1] != 1))))) + STOP (something_wrong (er, her, es)); + + if (*q == 0) { + *q = GMP_NUMB_MAX; + if (UNLIKELY ((es & 0xffff) == 0)) { + SPINNER(2); + } + if (her) { + ++es; + her = 0; + er = er - 2 * es; + } else { + her = --er != GMP_NUMB_MAX; + if (her & (er > es * 2)) { + er -= es * 2 + 1; + her = 0; + ++es; + } + } + } else { + *q = 0; + if (++q[1] == 0) break; + if ((her == 0) | (er < es * 2)) { + her += ++er == 0; + } else { + er -= es * 2; + her = 0; + ++es; + } + } + } while (1); + printf ("| %u\nValues of at most two limbs, tested.\n", GMP_NUMB_BITS*2); + return 0; +} + +int +check_corner_cases (int justone, int quick) +{ + mp_limb_t es, er, s[1], r[2], q[2]; + mp_size_t x; + unsigned bits; + + es = 1; + if (quick) { + es <<= GMP_NUMB_BITS / 2 - 1; + printf ("Quick, skipping some... 
(%u)\n", GMP_NUMB_BITS - 2); + } + er = 0; + *q = es*es; + printf ("Corner cases tested, up to bits:\n"); + do { + x = mpn_sqrtrem (s, r, q, 1); + if (UNLIKELY (x != (er != 0)) || UNLIKELY (*s != es) + || UNLIKELY ((x == 1) && (er != *r))) + STOP (something_wrong (er, 0, es)); + + if (er != 0) { + ++es; + if (UNLIKELY ((es & 0xffff) == 0)) + SPINNER(1); + er = 0; + ++*q; + } else { + er = es * 2; + *q += er; + } + } while (*q != 0); + q[1] = 1; + SPINNER(2); + printf ("\nValues of a single limb, tested.\n"); + if (justone) return 0; + if (quick) { + es <<= GMP_NUMB_BITS / 2 - 1; + q[1] <<= GMP_NUMB_BITS - 2; + printf ("Quick, skipping some... (%u)\n", GMP_NUMB_BITS - 2); + --es; + --q[1]; + q[0] -= es*2+1; + } + printf ("Corner cases tested, up to bits:\n"); + do { + x = mpn_sqrtrem (s, r, q, 2); + if (UNLIKELY (x != (er != 0)) || UNLIKELY (*s != es) + || UNLIKELY ((x == 1) && (er != *r))) + STOP (something_wrong (er, 0, es)); + + if (er != 0) { + ++es; + if (UNLIKELY ((es & 0xff) == 0)) + SPINNER(2); + er = 0; + q[1] += (++*q == 0); + if (UNLIKELY (es == GMP_NUMB_HIGHBIT)) + break; + } else { + er = es * 2; + add_ssaaaa (q[1], *q, q[1], *q, 0, er); + } + } while (1); + SPINNER(2); + printf ("\nValues with at most a limb for reminder, tested.\nCorner cases tested, up to bits:\n"); + x = mpn_sqrtrem (s, r, q, 2); + if ((*s != es) || (x != 0)) + STOP (something_wrong (0, 0, es)); + q[1] += 1; + x = mpn_sqrtrem (s, r, q, 2); + if ((*s != es) || (x != 2) || (*r != 0) || (r[1] != 1)) + STOP (something_wrong (0, 1, es)); + ++es; + q[1] += (++*q == 0); + do { + x = mpn_sqrtrem (s, r, q, 2); + if (UNLIKELY (x != (er != 0) * 2) || UNLIKELY (*s != es) + || UNLIKELY ((x == 2) && ((er != *r) || (r[1] != 1)))) + STOP (something_wrong (er, er != 0, es)); + + if (er != 0) { + ++es; + if (UNLIKELY (es == 0)) + break; + if (UNLIKELY ((es & 0xff) == 0)) + SPINNER(2); + er = 0; + q[1] += (++*q == 0); + } else { + er = es * 2; + add_ssaaaa (q[1], *q, q[1], *q, 1, er); + } + } while 
(1); + printf ("| %u\nValues of at most two limbs, tested.\n", GMP_NUMB_BITS*2); + return 0; +} + +int +main (int argc, char **argv) +{ + int mode = 0; + int justone = 0; + int quick = 0; + + for (;argc > 1;--argc,++argv) + switch (*argv[1]) { + default: + fprintf (stderr, "usage: sqrtrem_1_2 [x|c|s] [1|2] [q]\n"); + exit (1); + case 'x': + mode = 0; + break; + case 'c': + mode = 1; + break; + case 's': + mode = 2; + break; + case 'q': + quick = 1; + break; + case '1': + justone = 1; + break; + case '2': + justone = 0; + } + + switch (mode) { + default: + return check_all_values (justone, quick); + case 1: + return check_corner_cases (justone, quick); + case 2: + return check_some_values (justone, quick); + } +} diff --git a/gcc/gmp/tests/mpn/t-fib2m.c b/gcc/gmp/tests/mpn/t-fib2m.c new file mode 100644 index 0000000..5ad3942 100644 --- /dev/null +++ b/gcc/gmp/tests/mpn/t-fib2m.c @@ -1,0 +1,344 @@ +/* Test mpn_fib2m. + +Copyright 2018 Free Software Foundation, Inc. + +This file is part of the GNU MP Library test suite. + +The GNU MP Library test suite is free software; you can redistribute it +and/or modify it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 3 of the License, +or (at your option) any later version. + +The GNU MP Library test suite is distributed in the hope that it will be +useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. + +You should have received a copy of the GNU General Public License along with +the GNU MP Library test suite. If not, see https://www.gnu.org/licenses/. 
*/ + +#include +#include + +#include "gmp-impl.h" +#include "tests.h" + +#define MAX_K_BITS 16 +#define MAX_K (1L << MAX_K_BITS) +#define MIN_K 1 + +#define MAX_MN 20 +#define MAX_KN 30 + +#define COUNT 200 + +static int +test_fib2_fib2m (int count, gmp_randstate_ptr rands) +{ + int test; + mp_ptr fk, fks1, fkm, fks1m, mp, qp; + mp_size_t mn, fn, size, max_mn; + TMP_DECL; + + size = MPN_FIB2_SIZE (MAX_K); + max_mn = size / 4 + 10; + ASSERT (max_mn < size); + + TMP_MARK; + fk = TMP_ALLOC_LIMBS (size); + fks1 = TMP_ALLOC_LIMBS (size); + qp = TMP_ALLOC_LIMBS (size); + mp = TMP_ALLOC_LIMBS (max_mn); + fkm = 1 + TMP_ALLOC_LIMBS (max_mn * 2 + 1 + 2); + fks1m = 1 + TMP_ALLOC_LIMBS (max_mn * 2 + 1 + 2); + + for (test = 1; test <= count; ++test) + { + mp_limb_t fk_before, fk_after, fk1_before, fk1_after; + int signflip; + unsigned long k; + + k = MIN_K + + gmp_urandomm_ui (rands, test < MAX_K_BITS ? + MAX_K >> test : (MAX_K - MIN_K)); + + fn = mpn_fib2_ui (fk, fks1, k); + do { + mn = gmp_urandomm_ui (rands, MAX_K) % (fn / 4 + 10); + } while (mn == 0); + ASSERT (mn <= max_mn); + mpn_random2 (mp, mn); + ASSERT (mp [mn - 1] != 0); + + if (fn >= mn) + { + mpn_tdiv_qr (qp, fk, 0, fk, fn, mp, mn); + mpn_tdiv_qr (qp, fks1, 0, fks1, fn, mp, mn); + } + else + { + MPN_ZERO (fk + fn, mn - fn); + MPN_ZERO (fks1 + fn, mn - fn); + } + + mpn_random2 (fkm - 1, 2*mn+1+2); + fk_before = fkm [-1]; + fk_after = fkm [2 * mn + 1]; + + mpn_random2 (fks1m - 1, 2*mn+1+2); + fk1_before = fks1m [-1]; + fk1_after = fks1m [2 * mn + 1]; + + qp [0] = k; + signflip = mpn_fib2m (fkm, fks1m, qp, 1, mp, mn); + if (fkm [-1] != fk_before || fkm [2 * mn + 1] != fk_after + || fks1m [-1] != fk1_before || fks1m [2 * mn + 1] != fk1_after) + { + printf ("REDZONE violation in test %d, k = %lu, mn = %u\n", + test, k, (unsigned) mn); + if (fkm[-1] != fk_before) + { + printf ("before fkm:"); mpn_dump (fkm - 1, 1); + printf ("keep: "); mpn_dump (&fk_before, 1); + } + if (fkm[2 * mn + 1] != fk_after) + { + printf ("after 
fkm:"); mpn_dump (fkm + 2 * mn + 1, 1); + printf ("keep: "); mpn_dump (&fk_after, 1); + } + if (fks1m[-1] != fk1_before) + { + printf ("before fks1m:"); mpn_dump (fks1m - 1, 1); + printf ("keep: "); mpn_dump (&fk1_before, 1); + } + if (fks1m[2 * mn + 1] != fk1_after) + { + printf ("after fks1m:"); mpn_dump (fks1m + 2 * mn + 1, 1); + printf ("keep: "); mpn_dump (&fk1_after, 1); + } + abort(); + } + + if (mpn_cmp (fkm, fk, mn) != 0) + { + if (mpn_sub_n (fk, mp, fk, mn) || mpn_cmp (fkm, fk, mn) != 0) + { + printf ("ERROR(k) in test %d, k = %lu, mn = %u\n", + test, k, (unsigned) mn); + mpn_dump (fk, mn); + mpn_dump (fkm, mn); + mpn_dump (mp, mn); + abort(); + } + signflip ^= 1; + } + + if (mpn_cmp (fks1m, fks1, mn) != 0) + { + if (mpn_sub_n (fks1, mp, fks1, mn) || mpn_cmp (fks1m, fks1, mn) != 0) + { + printf ("ERROR(k-1) in test %d, k = %lu, mn = %u\n", + test, k, (unsigned) mn); + mpn_dump (fks1, mn); + mpn_dump (fks1m, mn); + mpn_dump (mp, mn); + abort(); + } + signflip ^= 1; + } + + if (signflip != 0 && ! mpn_zero_p (fks1m, mn) && ! mpn_zero_p (fkm, mn)) + { + if ((mp [0] & 1) == 0) /* Should we test only odd modulus-es? */ + { + if (! mpn_lshift (fks1m, fks1m, mn, 1) && + mpn_cmp (mp, fks1m, mn) == 0) + continue; + if (! 
mpn_lshift (fkm, fkm, mn, 1) && + mpn_cmp (mp, fkm, mn) == 0) + continue; + } + printf ("ERROR(sign) in test %d, k = %lu, mn = %u\n", + test, k, (unsigned) mn); + abort(); + } + } + TMP_FREE; + return 0; +} + +static int +test_fib2m_2exp (int count, gmp_randstate_ptr rands) +{ + int test; + mp_ptr fka, fks1a, fkb, fks1b, mp, kp; + TMP_DECL; + + TMP_MARK; + kp = TMP_ALLOC_LIMBS (MAX_KN); + mp = TMP_ALLOC_LIMBS (MAX_MN); + fka = 1 + TMP_ALLOC_LIMBS (MAX_MN * 2 + 1 + 2); + fks1a = 1 + TMP_ALLOC_LIMBS (MAX_MN * 2 + 1 + 2); + fkb = 1 + TMP_ALLOC_LIMBS (MAX_MN * 2 + 1 + 2); + fks1b = 1 + TMP_ALLOC_LIMBS (MAX_MN * 2 + 1 + 2); + + for (test = 1; test <= count; ++test) + { + mp_limb_t fka_before, fka_after, fk1a_before, fk1a_after; + mp_limb_t fkb_before, fkb_after, fk1b_before, fk1b_after; + mp_size_t mn, kn; + int signflip; + mp_bitcnt_t exp2; + + mn = gmp_urandomm_ui (rands, MAX_MN - 1) + 1; + mpn_random2 (mp, mn); + + exp2 = MIN_K + 1 + gmp_urandomm_ui (rands, MAX_KN * GMP_NUMB_BITS - MIN_K - 1); + + kn = BITS_TO_LIMBS (exp2); + MPN_ZERO (kp, kn - 1); + kp [kn - 1] = CNST_LIMB (1) << ((exp2 - 1) % GMP_NUMB_BITS); + + mpn_random2 (fka - 1, 2*mn+1+2); + fka_before = fka [-1]; + fka_after = fka [2 * mn + 1]; + + mpn_random2 (fks1a - 1, 2*mn+1+2); + fk1a_before = fks1a [-1]; + fk1a_after = fks1a [2 * mn + 1]; + + signflip = mpn_fib2m (fka, fks1a, kp, kn, mp, mn); + if (fka [-1] != fka_before || fka [2 * mn + 1] != fka_after + || fks1a [-1] != fk1a_before || fks1a [2 * mn + 1] != fk1a_after) + { + printf ("REDZONE(a) violation in test %d, exp2 = %lu\n", test, exp2); + if (fka[-1] != fka_before) + { + printf ("before fka:"); mpn_dump (fka - 1, 1); + printf ("keep: "); mpn_dump (&fka_before, 1); + } + if (fka[2 * mn + 1] != fka_after) + { + printf ("after fka:"); mpn_dump (fka + 2 * mn + 1, 1); + printf ("keep: "); mpn_dump (&fka_after, 1); + } + if (fks1a[-1] != fk1a_before) + { + printf ("before fks1a:"); mpn_dump (fks1a - 1, 1); + printf ("keep: "); mpn_dump (&fk1a_before, 
1); + } + if (fks1a[2 * mn + 1] != fk1a_after) + { + printf ("after fks1a:"); mpn_dump (fks1a + 2 * mn + 1, 1); + printf ("keep: "); mpn_dump (&fk1a_after, 1); + } + abort(); + } + + if (signflip && ! mpn_zero_p (fks1a, mn)) + mpn_sub_n (fks1a, mp, fks1a, mn); + if (mpn_sub_n (fka, fka, fks1a, mn)) + ASSERT_CARRY (mpn_add_n (fka, fka, mp, mn)); + + mpn_sub_1 (kp, kp, kn, 1); + ASSERT (exp2 % GMP_NUMB_BITS == 1 || kp [kn - 1] != 0); + kn -= kp [kn - 1] == 0; + + mpn_random2 (fkb - 1, 2*mn+1+2); + fkb_before = fkb [-1]; + fkb_after = fkb [2 * mn + 1]; + + mpn_random2 (fks1b - 1, 2*mn+1+2); + fk1b_before = fks1b [-1]; + fk1b_after = fks1b [2 * mn + 1]; + + signflip = mpn_fib2m (fkb, fks1b, kp, kn, mp, mn); + if (fkb [-1] != fkb_before || fkb [2 * mn + 1] != fkb_after + || fks1b [-1] != fk1b_before || fks1b [2 * mn + 1] != fk1b_after) + { + printf ("REDZONE(b) violation in test %d, exp2 = %lu\n", test, exp2); + if (fkb[-1] != fkb_before) + { + printf ("before fkb:"); mpn_dump (fkb - 1, 1); + printf ("keep: "); mpn_dump (&fkb_before, 1); + } + if (fkb[2 * mn + 1] != fkb_after) + { + printf ("after fkb:"); mpn_dump (fkb + 2 * mn + 1, 1); + printf ("keep: "); mpn_dump (&fkb_after, 1); + } + if (fks1b[-1] != fk1b_before) + { + printf ("before fks1b:"); mpn_dump (fks1b - 1, 1); + printf ("keep: "); mpn_dump (&fk1b_before, 1); + } + if (fks1b[2 * mn + 1] != fk1b_after) + { + printf ("after fks1b:"); mpn_dump (fks1b + 2 * mn + 1, 1); + printf ("keep: "); mpn_dump (&fk1b_after, 1); + } + abort(); + } + + if (mpn_cmp (fks1a, fkb, mn) != 0) + { + if (mpn_sub_n (fkb, mp, fkb, mn) || mpn_cmp (fks1a, fkb, mn) != 0) + { + printf ("ERROR(k) in test %d, exp2 = %lu\n", test, exp2); + mpn_dump (fks1a, mn); + mpn_dump (fkb, mn); + mpn_dump (mp, mn); + abort(); + } + signflip ^= 1; + } + + if (mpn_cmp (fka, fks1b, mn) != 0) + { + if (mpn_sub_n (fks1b, mp, fks1b, mn) || mpn_cmp (fka, fks1b, mn) != 0) + { + printf ("ERROR(k-1) in test %d, exp2 = %lu\n", test, exp2); + mpn_dump (fka, mn); + 
mpn_dump (fks1b, mn); + mpn_dump (mp, mn); + abort(); + } + signflip ^= 1; + } + + if (signflip != 0 && ! mpn_zero_p (fks1b, mn) && ! mpn_zero_p (fkb, mn)) + { + if ((mp [0] & 1) == 0) /* Should we test only odd modulus-es? */ + { + if (! mpn_lshift (fks1b, fks1b, mn, 1) && + mpn_cmp (mp, fks1b, mn) == 0) + continue; + if (! mpn_lshift (fkb, fkb, mn, 1) && + mpn_cmp (mp, fkb, mn) == 0) + continue; + } + printf ("ERROR(sign) in test %d, exp2 = %lu\n", + test, exp2); + abort(); + } + } + TMP_FREE; + return 0; +} + +int +main (int argc, char **argv) +{ + int count = COUNT; + gmp_randstate_ptr rands; + + tests_start (); + TESTS_REPS (count, argv, argc); + rands = RANDS; + + test_fib2_fib2m (count / 2, rands); + test_fib2m_2exp (count / 2, rands); + + tests_end (); + exit (0); +} diff --git a/gcc/gmp/tests/mpn/t-gcd_11.c b/gcc/gmp/tests/mpn/t-gcd_11.c new file mode 100644 index 0000000..14226b0 100644 --- /dev/null +++ b/gcc/gmp/tests/mpn/t-gcd_11.c @@ -1,0 +1,83 @@ +/* Test mpn_gcd_11. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library test suite. + +The GNU MP Library test suite is free software; you can redistribute it +and/or modify it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 3 of the License, +or (at your option) any later version. + +The GNU MP Library test suite is distributed in the hope that it will be +useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. + +You should have received a copy of the GNU General Public License along with +the GNU MP Library test suite. If not, see https://www.gnu.org/licenses/. 
*/
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "gmp-impl.h"
+#include "tests.h"
+
+#ifndef COUNT
+#define COUNT 500000
+#endif
+
+/* Call mpn_gcd_11 on the limbs A and B.  If the result differs from REF,
+   print the operands together with both results and abort.  */
+static void
+one_test (mp_limb_t a, mp_limb_t b, mp_limb_t ref)
+{
+  mp_limb_t r = mpn_gcd_11 (a, b);
+  if (r != ref)
+    {
+      gmp_fprintf (stderr,
+                   "gcd_11 (0x%Mx, 0x%Mx) failed, got: 0x%Mx, ref: 0x%Mx\n",
+                   a, b, r, ref);
+      abort();
+    }
+}
+
+/* Compare mpn_gcd_11 with refmpn_gcd_11 on `count' pairs of single-limb
+   operands.  The default count is COUNT; TESTS_REPS may adjust it.  */
+int
+main (int argc, char **argv)
+{
+  mpz_t a, b;
+  int count = COUNT;
+  int test;
+  gmp_randstate_ptr rands;
+
+  TESTS_REPS (count, argv, argc);
+
+  tests_start ();
+  rands = RANDS;
+
+  mpz_init (a);
+  mpz_init (b);
+  for (test = 0; test < count; test++)
+    {
+      mp_limb_t al, bl;
+      /* Bit sizes are drawn from 1 .. GMP_NUMB_BITS.  */
+      mp_bitcnt_t asize = 1 + gmp_urandomm_ui(rands, GMP_NUMB_BITS);
+      mp_bitcnt_t bsize = 1 + gmp_urandomm_ui(rands, GMP_NUMB_BITS);
+      /* Alternate between the two random generators.  */
+      if (test & 1)
+        {
+          mpz_urandomb (a, rands, asize);
+          mpz_urandomb (b, rands, bsize);
+        }
+      else
+        {
+          mpz_rrandomb (a, rands, asize);
+          mpz_rrandomb (b, rands, bsize);
+        }
+
+      /* Force both operands odd (and therefore non-zero).  */
+      mpz_setbit (a, 0);
+      mpz_setbit (b, 0);
+      al = mpz_getlimbn (a, 0);
+      bl = mpz_getlimbn (b, 0);
+      one_test (al, bl, refmpn_gcd_11 (al, bl));
+    }
+
+  mpz_clear (a);
+  mpz_clear (b);
+
+  /* Pair tests_start () with tests_end () and report success explicitly.  */
+  tests_end ();
+  exit (0);
+}
diff --git a/gcc/gmp/tests/mpn/t-gcd_22.c b/gcc/gmp/tests/mpn/t-gcd_22.c
new file mode 100644
index 0000000..baed35a
--- /dev/null
+++ b/gcc/gmp/tests/mpn/t-gcd_22.c
@@ -1,0 +1,84 @@
+/* Test mpn_gcd_22.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library test suite.
+
+The GNU MP Library test suite is free software; you can redistribute it
+and/or modify it under the terms of the GNU General Public License as
+published by the Free Software Foundation; either version 3 of the License,
+or (at your option) any later version.
+
+The GNU MP Library test suite is distributed in the hope that it will be
+useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+the GNU MP Library test suite. If not, see https://www.gnu.org/licenses/. */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "gmp-impl.h"
+#include "tests.h"
+
+#ifndef COUNT
+#define COUNT 150000
+#endif
+
+/* Call mpn_gcd_22 on the two low limbs of A and B and compare the
+   double-limb result with the two low limbs of REF.  On a mismatch, print
+   the operands and both results and abort.  */
+static void
+one_test (mpz_srcptr a, mpz_srcptr b, mpz_srcptr ref)
+{
+  mp_double_limb_t r = mpn_gcd_22 (mpz_getlimbn (a, 1), mpz_getlimbn (a, 0),
+                                   mpz_getlimbn (b, 1), mpz_getlimbn (b, 0));
+  if (r.d0 != mpz_getlimbn (ref, 0) || r.d1 != mpz_getlimbn (ref, 1))
+    {
+      gmp_fprintf (stderr,
+                   "gcd_22 (0x%Zx, 0x%Zx) failed, got: g1 = 0x%Mx g0 = %Mx, ref: 0x%Zx\n",
+                   a, b, r.d1, r.d0, ref);
+      abort();
+    }
+}
+
+/* Compare mpn_gcd_22 with refmpz_gcd on `count' pairs of operands of at
+   most two limbs.  The default count is COUNT; TESTS_REPS may adjust it.  */
+int
+main (int argc, char **argv)
+{
+  mpz_t a, b, ref;
+  int count = COUNT;
+  int test;
+  gmp_randstate_ptr rands;
+
+  TESTS_REPS (count, argv, argc);
+
+  tests_start ();
+  rands = RANDS;
+
+  mpz_init (a);
+  mpz_init (b);
+  mpz_init (ref);
+  for (test = 0; test < count; test++)
+    {
+      /* Bit sizes are drawn from 1 .. 2*GMP_NUMB_BITS.  */
+      mp_bitcnt_t asize = 1 + gmp_urandomm_ui(rands, 2*GMP_NUMB_BITS);
+      mp_bitcnt_t bsize = 1 + gmp_urandomm_ui(rands, 2*GMP_NUMB_BITS);
+      /* Alternate between the two random generators.  */
+      if (test & 1)
+        {
+          mpz_urandomb (a, rands, asize);
+          mpz_urandomb (b, rands, bsize);
+        }
+      else
+        {
+          mpz_rrandomb (a, rands, asize);
+          mpz_rrandomb (b, rands, bsize);
+        }
+
+      /* Force both operands odd (and therefore non-zero).  */
+      mpz_setbit (a, 0);
+      mpz_setbit (b, 0);
+      refmpz_gcd (ref, a, b);
+      one_test (a, b, ref);
+    }
+
+  mpz_clear (a);
+  mpz_clear (b);
+  mpz_clear (ref);
+
+  /* Pair tests_start () with tests_end () and report success explicitly.  */
+  tests_end ();
+  exit (0);
+}
diff --git a/gcc/gmp/tests/mpn/t-gcdext_1.c b/gcc/gmp/tests/mpn/t-gcdext_1.c
new file mode 100644
index 0000000..99143ae
--- /dev/null
+++ b/gcc/gmp/tests/mpn/t-gcdext_1.c
@@ -1,0 +1,131 @@
+/* Test mpn_gcdext_1.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library test suite.
+
+The GNU MP Library test suite is free software; you can redistribute it
+and/or modify it under the terms of the GNU General Public License as
+published by the Free Software Foundation; either version 3 of the License,
+or (at your option) any later version.
+
+The GNU MP Library test suite is distributed in the hope that it will be
+useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+the GNU MP Library test suite. If not, see https://www.gnu.org/licenses/. */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "gmp-impl.h"
+#include "tests.h"
+
+#ifndef COUNT
+#define COUNT 250000
+#endif
+
+/* Set R to the value of the signed limb X.  The magnitude is taken with
+   ABS_CAST and the sign is applied afterwards.  */
+static void
+set_signed_limb (mpz_t r, mp_limb_signed_t x)
+{
+  mpz_t t;
+  mp_limb_t abs_x = ABS_CAST(mp_limb_t, x);
+  mpz_set (r, mpz_roinit_n (t, &abs_x, 1));
+  if (x < 0)
+    mpz_neg (r, r);
+}
+
+/* Call mpn_gcdext_1 on A and B and check the returned g, s, t:
+     - g is non-zero,
+     - a*s + b*t == g,
+     - g divides both a and b,
+     - |s| == 1 or 2*|s| < b/g, and |t| == 1 or 2*|t| < a/g.
+   Return quietly when every check holds; otherwise print the operands and
+   results and abort.  */
+static void
+one_test (mp_limb_t a, mp_limb_t b)
+{
+  mp_limb_signed_t s, t;
+  mp_limb_t g;
+
+  g = mpn_gcdext_1 (&s, &t, a, b);
+
+  if (g > 0)
+    {
+      mpz_t d, sz, tz, tmp;
+
+      mpz_init (d);
+      mpz_init (sz);
+      mpz_init (tz);
+
+      set_signed_limb (sz, s);
+      set_signed_limb (tz, t);
+
+      /* d = a*s + b*t */
+      mpz_mul (d, mpz_roinit_n (tmp, &a, 1), sz);
+      mpz_addmul (d, mpz_roinit_n (tmp, &b, 1), tz);
+
+      if (mpz_cmp (d, mpz_roinit_n (tmp, &g, 1)) == 0
+          && a % g == 0 && b % g == 0)
+        {
+          mp_limb_t a_div_g = a / g;
+          mp_limb_t b_div_g = b / g;
+          mp_limb_t abs_s = ABS_CAST(mp_limb_t, s);
+          mp_limb_t abs_t = ABS_CAST(mp_limb_t, t);
+          /* From here on sz and tz hold 2*s and 2*t for the size check.  */
+          mpz_mul_ui (sz, sz, 2);
+          mpz_mul_ui (tz, tz, 2);
+          if ((abs_s == 1 || mpz_cmpabs (sz, mpz_roinit_n (tmp, &b_div_g, 1)) < 0)
+              && (abs_t == 1 || mpz_cmpabs (tz, mpz_roinit_n (tmp, &a_div_g, 1)) < 0))
+            {
+              mpz_clear (d);
+              mpz_clear (sz);
+              mpz_clear (tz);
+
+              return;
+            }
+        }
+    }
+  gmp_fprintf (stderr,
+               "gcdext_1 (0x%Mx, 0x%Mx) failed, got: g = 0x%Mx, s = %s0x%Mx, t = %s0x%Mx\n",
+               a, b, g,
+               s < 0 ? "-" : "", ABS_CAST(mp_limb_t, s),
+               t < 0 ? "-" : "", ABS_CAST(mp_limb_t, t));
+  abort();
+}
+
+/* Run one_test on `count' pairs of non-zero single-limb operands.  The
+   default count is COUNT; TESTS_REPS may adjust it.  */
+int
+main (int argc, char **argv)
+{
+  mpz_t a, b;
+  int count = COUNT;
+  int test;
+  gmp_randstate_ptr rands;
+
+  TESTS_REPS (count, argv, argc);
+
+  tests_start ();
+  rands = RANDS;
+
+  mpz_init (a);
+  mpz_init (b);
+  for (test = 0; test < count; test++)
+    {
+      mp_limb_t al, bl;
+      /* Bit sizes are drawn from 1 .. GMP_NUMB_BITS.  */
+      mp_bitcnt_t asize = 1 + gmp_urandomm_ui(rands, GMP_NUMB_BITS);
+      mp_bitcnt_t bsize = 1 + gmp_urandomm_ui(rands, GMP_NUMB_BITS);
+      /* Alternate between the two random generators.  */
+      if (test & 1)
+        {
+          mpz_urandomb (a, rands, asize);
+          mpz_urandomb (b, rands, bsize);
+        }
+      else
+        {
+          mpz_rrandomb (a, rands, asize);
+          mpz_rrandomb (b, rands, bsize);
+        }
+
+      al = mpz_getlimbn (a, 0);
+      bl = mpz_getlimbn (b, 0);
+      /* Replace a zero operand by 1.  */
+      al += (al == 0);
+      bl += (bl == 0);
+
+      one_test (al, bl);
+    }
+
+  mpz_clear (a);
+  mpz_clear (b);
+
+  /* Pair tests_start () with tests_end () and report success explicitly.  */
+  tests_end ();
+  exit (0);
+}
diff --git a/gcc/gmp/tests/mpz/t-lucm.c b/gcc/gmp/tests/mpz/t-lucm.c
new file mode 100644
index 0000000..3b6dcd1
--- /dev/null
+++ b/gcc/gmp/tests/mpz/t-lucm.c
@@ -1,0 +1,144 @@
+/* Test mpz_powm, mpz_lucas_mod.
+
+Copyright 1991, 1993, 1994, 1996, 1999-2001, 2009, 2012, 2018 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library test suite.
+
+The GNU MP Library test suite is free software; you can redistribute it
+and/or modify it under the terms of the GNU General Public License as
+published by the Free Software Foundation; either version 3 of the License,
+or (at your option) any later version.
+
+The GNU MP Library test suite is distributed in the hope that it will be
+useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+the GNU MP Library test suite. If not, see https://www.gnu.org/licenses/.
*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "gmp-impl.h"
+#include "tests.h"
+
+void debug_mp (mpz_t, int);
+
+#define SIZEM 8
+
+/* FIXME: Should we implement another sequence to test lucas mod? */
+/* Eg: a generalisation of what we use for Fibonacci: */
+/* U_{2n-1} = U_n^2 - Q*U_{n-1}^2 */
+/* U_{2n+1} = D*U_n^2 + Q*U_{2n-1} + 2*Q^n ; with D = (P^2-4*Q) */
+/* P*U_{2n} = U_{2n+1} + Q*U_{2n-1} */
+
+/* Cross-check mpz_lucas_mod against mpz_powm.  Each round picks a random
+   Q (with |Q| >= 2), a random modulus coprime to Q and a shift exp_size,
+   then compares the second output of mpz_lucas_mod with
+   Q^exp mod `mod', where exp = floor (mod / 2^exp_size) + 1.  */
+int
+main (int argc, char **argv)
+{
+  mpz_t base, exp, mod;
+  mpz_t r1, r2, t1, t2;
+  mp_size_t exp_size, mod_size;
+  int i, res;
+  int reps = 1000;
+  long Q;
+  gmp_randstate_ptr rands;
+  mpz_t bs;
+  unsigned long bsi, size_range;
+
+  tests_start ();
+  TESTS_REPS (reps, argv, argc);
+
+  rands = RANDS;
+
+  mpz_init (bs);
+
+  mpz_init (base);
+  mpz_init (exp);
+  mpz_init (mod);
+  mpz_init (r1);
+  mpz_init (r2);
+  mpz_init (t1);
+  mpz_init (t2);
+
+  for (i = 0; i < reps; i++)
+    {
+      mpz_urandomb (bs, rands, 32);
+      size_range = mpz_get_ui (bs) % SIZEM + 1;
+
+      do  /* Loop until base >= 2 and fits in a long. */
+        {
+          mpz_urandomb (base, rands, BITS_PER_ULONG - 2);
+        }
+      while (mpz_cmp_ui (base, 2) < 0 || mpz_fits_slong_p (base) == 0);
+
+      Q = mpz_get_ui (base);
+
+      /* Draw moduli (each at least 16) until one is coprime to Q.  */
+      do
+        {
+          ++size_range;
+          size_range = MIN (size_range, SIZEM);
+          mpz_urandomb (bs, rands, size_range);
+          mod_size = mpz_get_ui (bs);
+          mpz_rrandomb (mod, rands, mod_size);
+          mpz_add_ui (mod, mod, 16);
+        }
+      while (mpz_gcd_ui (NULL, mod, Q) != 1);
+
+      /* mod >= 16 has at least 5 bits, so mod_size >= 2 here and the
+         modulo below never divides by zero.  */
+      mod_size = mpz_sizeinbase (mod, 2) - 3;
+      mpz_urandomb (bs, rands, 32);
+      exp_size = mpz_get_ui (bs) % mod_size + 2;
+
+      /* exp = floor (mod / 2^exp_size) + 1 */
+      mpz_tdiv_q_2exp (exp, mod, exp_size);
+      mpz_add_ui (exp, exp, 1);
+
+      /* Negate Q (and base with it) in half of the rounds.  */
+      mpz_urandomb (bs, rands, 2);
+      bsi = mpz_get_ui (bs);
+      if ((bsi & 1) != 0)
+        {
+          mpz_neg (base, base);
+          Q = -Q;
+        }
+
+      res = mpz_lucas_mod (t1, r2, Q, exp_size, mod, t2, r1);
+      /* A non-zero return is not compared; bump reps so that the skipped
+         round does not count against the requested number of tests.  */
+      if (res && ++reps)
+        continue;
+      MPZ_CHECK_FORMAT (r2);
+      if (mpz_cmp_ui (r2, 0) < 0)
+        mpz_add (r2, r2, mod);
+      mpz_powm (r1, base, exp, mod);
+
+      if (mpz_cmp (r1, r2) != 0)
+        {
+          fprintf (stderr, "\nIncorrect results in test %d for operands:\n", i);
+          debug_mp (base, -16);
+          debug_mp (exp, -16);
+          debug_mp (mod, -16);
+          fprintf (stderr, "mpz_powm result:\n");
+          debug_mp (r1, -16);
+          fprintf (stderr, "mpz_lucas_mod result (%d) Q=%ld:\n", res, Q);
+          debug_mp (r2, -16);
+          abort ();
+        }
+    }
+
+  mpz_clear (bs);
+  mpz_clear (base);
+  mpz_clear (exp);
+  mpz_clear (mod);
+  mpz_clear (r1);
+  mpz_clear (r2);
+  mpz_clear (t1);
+  mpz_clear (t2);
+
+  tests_end ();
+  exit (0);
+}
+
+/* Print X on stderr in the given BASE, followed by a newline.  */
+void
+debug_mp (mpz_t x, int base)
+{
+  mpz_out_str (stderr, base, x); fputc ('\n', stderr);
+}
diff --git a/gcc/gmp/mpn/alpha/ev67/gcd_11.asm b/gcc/gmp/mpn/alpha/ev67/gcd_11.asm
new file mode 100644
index 0000000..03c234b
--- /dev/null
+++ b/gcc/gmp/mpn/alpha/ev67/gcd_11.asm
@@ -1,0 +1,79 @@
+dnl Alpha ev67 mpn_gcd_11 -- Nx1 greatest common divisor.
+
+dnl Copyright 2003, 2004 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C ev67: 3.4 cycles/bitpair for 1x1 part + + +C mp_limb_t mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y); +C +C In the 1x1 part, the algorithm is to change x,y to abs(x-y),min(x,y) and +C strip trailing zeros from abs(x-y) to maintain x and y both odd. +C +C The trailing zeros are calculated from just x-y, since in twos-complement +C there's the same number of trailing zeros on d or -d. This means the cttz +C runs in parallel with abs(x-y). +C +C The loop takes 5 cycles, and at 0.68 iterations per bit for two N-bit +C operands with this algorithm gives the measured 3.4 c/l. +C +C The slottings shown are for SVR4 style systems, Unicos differs in the +C initial gp setup and the LEA. 
+
+
+ASM_START()
+PROLOGUE(mpn_gcd_11)
+	mov	r16, r0
+	mov	r17, r1
+
+	ALIGN(16)
+L(top):	subq	r0, r1, r7		C l0  d = x - y
+	cmpult	r0, r1, r16		C u0  test x >= y
+
+	subq	r1, r0, r4		C l0  new_x = y - x
+	cttz	r7, r8			C U0  d twos
+
+	cmoveq	r16, r7, r4		C l0  new_x = d if x>=y
+	cmovne	r16, r0, r1		C u0  y = x if x<y
+	unop				C l   \ force cmoveq into l0
+	unop				C u   /
+
+	C				C cmoveq2 L0, cmovne2 U0
+
+	srl	r4, r8, r0		C U0  x = new_x >> twos
+	bne	r7, L(top)		C U1  stop when d==0
+
+
+L(end):	mov	r1, r0			C U0  return y, the gcd
+	ret	r31, (r26), 1		C L0
+EPILOGUE()
+ASM_END()
diff --git a/gcc/gmp/mpn/arm/v5/gcd_11.asm b/gcc/gmp/mpn/arm/v5/gcd_11.asm
new file mode 100644
index 0000000..3c2b48f 100644
--- /dev/null
+++ b/gcc/gmp/mpn/arm/v5/gcd_11.asm
@@ -1,0 +1,70 @@
+dnl ARM v5 mpn_gcd_11.
+
+dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for ARM by Torbjörn
+dnl Granlund.
+
+dnl Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/bit (approx)
+C StrongARM -
+C XScale ?
+C Cortex-A5 6.45 obsolete
+C Cortex-A7 6.41 obsolete
+C Cortex-A8 5.0 obsolete
+C Cortex-A9 5.9 obsolete
+C Cortex-A15 4.40 obsolete
+C Cortex-A17 5.68 obsolete
+C Cortex-A53 4.37 obsolete
+C Numbers measured with: speed -CD -s8-32 -t24 mpn_gcd_1
+
+define(`u0', `r0')
+define(`v0', `r1')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_gcd_11)
+	subs	r3, u0, v0		C r3 = u - v, sets flags	0
+	beq	L(end)			C u == v: nothing to reduce
+
+	ALIGN(16)
+L(top):	sub	r2, v0, u0		C r2 = v - u = -r3		0,5
+	and	r12, r2, r3		C lowest set bit of u-v		1
+	clz	r12, r12		C				2
+	rsb	r12, r12, #31		C r12 = index of that bit	3
+	rsbcc	r3, r3, #0		C v = abs(u-v), even	1
+	movcs	u0, v0			C u = min(u,v)		1
+	lsr	v0, r3, r12		C v = abs(u-v) >> r12		4
+	subs	r3, u0, v0		C next u - v, sets flags	5
+	bne	L(top)			C loop until u == v
+
+L(end):	bx	lr			C result is left in u0 (r0)
+EPILOGUE()
diff --git a/gcc/gmp/mpn/arm/v6t2/gcd_11.asm b/gcc/gmp/mpn/arm/v6t2/gcd_11.asm
new file mode 100644
index 0000000..8a38351 100644
--- /dev/null
+++ b/gcc/gmp/mpn/arm/v6t2/gcd_11.asm
@@ -1,0 +1,65 @@
+dnl ARM v6t2 mpn_gcd_11.
+
+dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2019 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library.
If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/bit (approx)
+C StrongARM -
+C XScale -
+C Cortex-A5 5.2
+C Cortex-A7 5.04
+C Cortex-A8 3.59
+C Cortex-A9 9.5
+C Cortex-A15 3.2
+C Cortex-A17 5.25
+C Cortex-A53 3.57
+
+define(`u0', `r0')
+define(`v0', `r1')
+
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_gcd_11)
+	subs	r3, u0, v0		C r3 = u - v, sets flags	0
+	beq	L(end)			C u == v: nothing to reduce
+
+	ALIGN(16)
+L(top):	rbit	r12, r3			C bit-reverse u - v		1,5
+	clz	r12, r12		C r12 = trailing zeros of u-v	2
+	rsbcc	r3, r3, #0		C v = abs(u-v), even	1
+	movcs	u0, v0			C u = min(u,v)		1
+	lsr	v0, r3, r12		C v = abs(u-v) >> r12		3
+	subs	r3, u0, v0		C next u - v, sets flags	4
+	bne	L(top)			C loop until u == v
+
+L(end):	bx	lr			C result is left in u0 (r0)
+EPILOGUE()
diff --git a/gcc/gmp/mpn/arm/v6t2/gcd_22.asm b/gcc/gmp/mpn/arm/v6t2/gcd_22.asm
new file mode 100644
index 0000000..3b23808 100644
--- /dev/null
+++ b/gcc/gmp/mpn/arm/v6t2/gcd_22.asm
@@ -1,0 +1,113 @@
+dnl ARM v6t2 mpn_gcd_22.
+
+dnl Copyright 2019 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/bit (approx)
+C StrongARM -
+C XScale -
+C Cortex-A5 10.1
+C Cortex-A7 9.1
+C Cortex-A8 6.3
+C Cortex-A9 ?
+C Cortex-A12 7.7
+C Cortex-A15 5.7
+C Cortex-A17 ?
+C Cortex-A53 7.0
+
+
+define(`gp', `r0')
+
+define(`u1', `r1')
+define(`u0', `r2')
+define(`v1', `r3')
+define(`v0', `r4')
+
+define(`t0', `r5')
+define(`t1', `r6')
+define(`cnt', `r7')
+
+ASM_START()
+PROLOGUE(mpn_gcd_22)
+	push	{ r4-r7 }		C saved here, popped before each return
+
+	ldr	v0, [sp,#16]		C v0 sits just above the 4 pushed words (presumably the stack-passed 5th argument)
+
+L(top):	subs	t0, u0, v0		C t0 = u0 - v0			0 7
+	beq	L(lowz)			C low limbs equal
+	sbcs	t1, u1, v1		C t1 = u1 - v1 - borrow		1 8
+
+	rbit	cnt, t0			C bit-reverse low difference	1
+
+	negcc	t0, t0			C u < v: negate {t1,t0} ...
+	mvncc	t1, t1			C ... t0 != 0, so high limb is ~t1
+L(bck):	movcc	v0, u0			C u < v: v = u (old u is the min)
+	movcc	v1, u1
+
+	clz	cnt, cnt		C cnt = trailing zeros of t0	2
+	rsb	r12, cnt, #32		C r12 = 32 - cnt		3
+
+	lsr	u0, t0, cnt		C {u1,u0} = {t1,t0} >> cnt	3
+	lsl	r12, t1, r12		C bits moving from t1 into u0	4
+	lsr	u1, t1, cnt		C 3
+	orr	u0, u0, r12		C 5
+
+	orrs	r12, u1, v1		C r12 = u1 | v1, sets Z
+	bne	L(top)			C loop while a high limb is nonzero
+
+
+	str	r12, [gp,#4]		C high result limb = 0 (r12 is 0 here)
+
+	mov	r6, gp			C keep gp across the call
+	mov	r0, u0			C pass 1st argument
+	mov	r1, v0			C pass 2nd argument
+	mov	r7, r14			C preserve link register
+	bl	mpn_gcd_11		C relies on callee leaving r6, r7 intact
+	str	r0, [r6,#0]		C low result limb = value returned in r0
+	mov	r14, r7			C restore link register
+	pop	{ r4-r7 }
+	bx	r14
+
+L(lowz):C We come here when v0 - u0 = 0
+	C 1. If v1 - u1 = 0, then gcd is u = v.
+	C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
+	subs	t0, u1, v1		C t0 = u1 - v1
+	beq	L(end)			C u == v
+	mov	t1, #0			C difference continues as a one-limb value
+	rbit	cnt, t0			C 1
+	negcc	t0, t0			C t0 = |u1 - v1|
+	b	L(bck)			C rejoin the shared shift code
+
+L(end):	str	v0, [gp,#0]		C result = {v1,v0}
+	str	v1, [gp,#4]
+	pop	{ r4-r7 }
+	bx	r14
+EPILOGUE()
diff --git a/gcc/gmp/mpn/arm64/cora53/cnd_aors_n.asm b/gcc/gmp/mpn/arm64/cora53/cnd_aors_n.asm
new file mode 100644
index 0000000..1b227da 100644
--- /dev/null
+++ b/gcc/gmp/mpn/arm64/cora53/cnd_aors_n.asm
@@ -1,0 +1,99 @@
+dnl ARM64 mpn_cnd_add_n, mpn_cnd_sub_n
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C Cortex-A53 3.5-4
+C Cortex-A57 2.25
+C X-Gene 3.5
+
+changecom(blah)
+
+define(`cnd', `x0')
+define(`rp', `x1')
+define(`up', `x2')
+define(`vp', `x3')
+define(`n', `x4')
+
+ifdef(`OPERATION_cnd_add_n', `
+  define(`ADDSUBC', adcs)
+  define(`CLRCY', `cmn xzr, xzr')
+  define(`RETVAL', `cset x0, cs')
+  define(`func', mpn_cnd_add_n)')
+ifdef(`OPERATION_cnd_sub_n', `
+  define(`ADDSUBC', sbcs)
+  define(`CLRCY', `cmp xzr, xzr')
+  define(`RETVAL', `cset x0, cc')
+  define(`func', mpn_cnd_sub_n)')
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
+
+ASM_START()
+PROLOGUE(func)
+	cmp	cnd, #1			C carry set iff cnd != 0
+	sbc	cnd, cnd, cnd		C cnd = 0 if cnd != 0, else all ones
+
+	CLRCY				C set the no-carry/no-borrow state for ADDSUBC.  NOTE(review): upstream says only needed for n = 0 (mod 4), but every path below uses ADDSUBC -- confirm
+
+	tbz	n, #0, L(1)		C n even: skip the single-limb step
+	ldr	x10, [up], #8
+	ldr	x12, [vp], #8
+	bic	x6, x12, cnd		C v limb, or 0 when cnd was 0
+	ADDSUBC	x8, x10, x6
+	sub	n, n, #1		C sub and cbz leave the carry flag alone
+	str	x8, [rp], #8
+	cbz	n, L(rt)		C n was 1: done
+
+L(1):	ldp	x10, x11, [up], #16	C preload the first limb pair
+	ldp	x12, x13, [vp], #16
+	sub	n, n, #2
+	cbz	n, L(end)		C only one pair: skip the loop
+
+L(top):	bic	x6, x12, cnd		C mask the current v pair
+	bic	x7, x13, cnd
+	ldp	x12, x13, [vp], #16	C load the next v pair
+	ADDSUBC	x8, x10, x6
+	ADDSUBC	x9, x11, x7
+	ldp	x10, x11, [up], #16	C load the next u pair
+	sub	n, n, #2
+	stp	x8, x9, [rp], #16
+	cbnz	n, L(top)
+
+L(end):	bic	x6, x12, cnd		C final pair, already loaded
+	bic	x7, x13, cnd
+	ADDSUBC	x8, x10, x6
+	ADDSUBC	x9, x11, x7
+	stp	x8, x9, [rp]
+L(rt):	RETVAL				C x0 = carry-out (add) or borrow-out (sub)
+	ret
+EPILOGUE()
diff --git a/gcc/gmp/mpn/arm64/cora53/gmp-mparam.h b/gcc/gmp/mpn/arm64/cora53/gmp-mparam.h
new file mode 100644
index 0000000..f4e258d 100644
--- /dev/null
+++ b/gcc/gmp/mpn/arm64/cora53/gmp-mparam.h
@@ -1,0 +1,242 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file for a53.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/.
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 1536 MHz Cortex-A53 */ +/* FFT tuning limit = 21,583,800 */ +/* Generated by tuneup.c, 2019-10-22, gcc 5.4 */ + +#define DIVREM_1_NORM_THRESHOLD 3 +#define DIVREM_1_UNNORM_THRESHOLD 4 +#define MOD_1_1P_METHOD 2 /* 4.84% faster than 1 */ +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 4 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 12 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 18 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 22 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_1N_PI1_METHOD 1 /* 39.05% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 21 +#define DIV_QR_1_UNNORM_THRESHOLD 21 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 38 + +#define DIV_1_VS_MUL_1_PERCENT 161 + +#define MUL_TOOM22_THRESHOLD 14 +#define MUL_TOOM33_THRESHOLD 49 +#define MUL_TOOM44_THRESHOLD 73 +#define MUL_TOOM6H_THRESHOLD 173 +#define MUL_TOOM8H_THRESHOLD 236 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 77 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 81 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 88 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 65 + +#define SQR_BASECASE_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 18 +#define SQR_TOOM3_THRESHOLD 68 +#define SQR_TOOM4_THRESHOLD 183 +#define SQR_TOOM6_THRESHOLD 230 +#define SQR_TOOM8_THRESHOLD 357 + +#define MULMID_TOOM42_THRESHOLD 23 + +#define MULMOD_BNM1_THRESHOLD 9 +#define SQRMOD_BNM1_THRESHOLD 11 + +#define MUL_FFT_MODF_THRESHOLD 316 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 316, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 13, 7}, { 7, 6}, { 15, 7}, { 8, 6}, \ + { 17, 7}, { 9, 6}, { 19, 7}, { 17, 8}, \ + { 9, 7}, { 20, 8}, { 11, 7}, { 23, 8}, \ + { 13, 9}, { 7, 8}, { 19, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 49, 9}, { 27,10}, \ 
+ { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \ + { 83,10}, { 47, 9}, { 99,10}, { 55,11}, \ + { 31,10}, { 63, 8}, { 255,10}, { 71, 8}, \ + { 287,10}, { 79, 9}, { 159, 8}, { 319,10}, \ + { 87,11}, { 47,10}, { 95, 9}, { 191, 8}, \ + { 383,10}, { 103, 9}, { 207, 8}, { 415,10}, \ + { 111, 9}, { 223,12}, { 31,11}, { 63, 9}, \ + { 255, 8}, { 511,10}, { 135, 9}, { 287, 8}, \ + { 575,11}, { 79,10}, { 159, 9}, { 319, 8}, \ + { 639,10}, { 175, 9}, { 351, 8}, { 703,11}, \ + { 95,10}, { 191, 9}, { 383, 8}, { 767,10}, \ + { 207, 9}, { 415, 8}, { 831,10}, { 223, 9}, \ + { 447,12}, { 63,10}, { 255, 9}, { 511, 8}, \ + { 1023, 9}, { 543,10}, { 287, 9}, { 575, 8}, \ + { 1151,11}, { 159,10}, { 319, 9}, { 639,11}, \ + { 175,10}, { 351, 9}, { 703, 8}, { 1407,12}, \ + { 95,11}, { 191,10}, { 383, 9}, { 767,11}, \ + { 207,10}, { 415, 9}, { 831,11}, { 223,10}, \ + { 447,13}, { 63,11}, { 255,10}, { 543,11}, \ + { 287,10}, { 575, 9}, { 1151,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 351,10}, { 703, 9}, \ + { 1407, 8}, { 2815,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,12}, { 223,11}, \ + { 447,10}, { 895,11}, { 479,10}, { 959, 9}, \ + { 1919,12}, { 255,11}, { 511,10}, { 1023,11}, \ + { 543,10}, { 1087,12}, { 287,11}, { 575,10}, \ + { 1151,12}, { 319,11}, { 639,12}, { 351,11}, \ + { 703,10}, { 1407, 9}, { 2815,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,10}, \ + { 1663,12}, { 447,11}, { 895,10}, { 1791,12}, \ + { 479,11}, { 959,13}, { 255,12}, { 511,11}, \ + { 1023,12}, { 543,11}, { 1087,12}, { 575,11}, \ + { 1151,13}, { 319,12}, { 703,11}, { 1407,10}, \ + { 2815,13}, { 383,12}, { 831,11}, { 1663,13}, \ + { 447,12}, { 895,11}, { 1791,12}, { 959,11}, \ + { 1919,14}, { 255,13}, { 511,12}, { 1087,13}, \ + { 575,12}, { 1151,13}, { 703,12}, { 1407,11}, \ + { 2815,14}, { 383,13}, { 831,12}, { 1663,13}, \ + { 895,12}, { 1791,13}, { 959,12}, { 1919,15}, \ + { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \ + { 1215,14}, { 639,13}, { 
1407,12}, { 2815,14}, \ + { 767,13}, { 1663,14}, { 895,13}, { 1919,12}, \ + { 3839,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2431,12}, { 4863,14}, { 16384,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 217 +#define MUL_FFT_THRESHOLD 3200 + +#define SQR_FFT_MODF_THRESHOLD 276 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 276, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 17, 7}, { 17, 8}, { 9, 7}, { 20, 8}, \ + { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \ + { 15, 7}, { 31, 8}, { 19, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95, 8}, { 191,10}, \ + { 55,11}, { 31,10}, { 63, 8}, { 255,10}, \ + { 71, 9}, { 143, 8}, { 287,10}, { 79, 9}, \ + { 159,11}, { 47,10}, { 95, 9}, { 191, 8}, \ + { 383, 7}, { 767,10}, { 103,12}, { 31,11}, \ + { 63, 9}, { 255, 8}, { 511, 7}, { 1023,10}, \ + { 143, 9}, { 287,11}, { 79,10}, { 159, 9}, \ + { 319, 8}, { 639,10}, { 175, 9}, { 351, 8}, \ + { 703,11}, { 95,10}, { 191, 9}, { 383, 8}, \ + { 767,10}, { 207, 9}, { 415, 8}, { 831,10}, \ + { 223, 9}, { 447,12}, { 63,10}, { 255, 9}, \ + { 511, 8}, { 1023,11}, { 143,10}, { 287, 9}, \ + { 575, 8}, { 1151,11}, { 159,10}, { 319, 9}, \ + { 639,11}, { 175,10}, { 351, 9}, { 703,12}, \ + { 95,11}, { 191,10}, { 383, 9}, { 767,11}, \ + { 207,10}, { 415, 9}, { 831,11}, { 223,10}, \ + { 447,13}, { 63,11}, { 255,10}, { 511, 9}, \ + { 1023,11}, { 287,10}, { 575, 9}, { 1151,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 351,10}, \ + { 703, 9}, { 1407,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,12}, { 223,11}, \ + { 447,10}, { 895,11}, { 479,10}, { 959,12}, \ + { 255,11}, { 511,10}, { 1023,12}, { 287,11}, \ + { 575,10}, { 1151,12}, { 319,11}, { 639,12}, \ + { 351,11}, { 703,10}, { 1407,13}, { 191,12}, \ + 
{ 383,11}, { 767,12}, { 415,11}, { 831,10}, \ + { 1663,12}, { 447,11}, { 895,12}, { 479,11}, \ + { 959,10}, { 1919,13}, { 255,12}, { 511,11}, \ + { 1023,12}, { 543,11}, { 1087,12}, { 575,11}, \ + { 1151,13}, { 319,12}, { 703,11}, { 1407,10}, \ + { 2815,13}, { 383,12}, { 831,11}, { 1663,13}, \ + { 447,12}, { 895,11}, { 1791,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \ + { 1151,13}, { 703,12}, { 1407,11}, { 2815,14}, \ + { 383,13}, { 831,12}, { 1663,13}, { 895,12}, \ + { 1791,13}, { 959,12}, { 1919,15}, { 255,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1151,14}, \ + { 639,13}, { 1407,12}, { 2815,14}, { 767,13}, \ + { 1663,14}, { 895,13}, { 1919,12}, { 3839,15}, \ + { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2431,12}, { 4863,14}, { 16384,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 204 +#define SQR_FFT_THRESHOLD 2688 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 38 +#define MULLO_MUL_N_THRESHOLD 6253 +#define SQRLO_BASECASE_THRESHOLD 4 +#define SQRLO_DC_THRESHOLD 67 +#define SQRLO_SQR_THRESHOLD 5240 + +#define DC_DIV_QR_THRESHOLD 43 +#define DC_DIVAPPR_Q_THRESHOLD 155 +#define DC_BDIV_QR_THRESHOLD 39 +#define DC_BDIV_Q_THRESHOLD 89 + +#define INV_MULMOD_BNM1_THRESHOLD 34 +#define INV_NEWTON_THRESHOLD 163 +#define INV_APPR_THRESHOLD 161 + +#define BINV_NEWTON_THRESHOLD 196 +#define REDC_1_TO_REDC_N_THRESHOLD 43 + +#define MU_DIV_QR_THRESHOLD 998 +#define MU_DIVAPPR_Q_THRESHOLD 998 +#define MUPI_DIV_QR_THRESHOLD 91 +#define MU_BDIV_QR_THRESHOLD 807 +#define MU_BDIV_Q_THRESHOLD 924 + +#define POWM_SEC_TABLE 6,30,125,579,1730 + +#define GET_STR_DC_THRESHOLD 15 +#define GET_STR_PRECOMPUTE_THRESHOLD 30 +#define SET_STR_DC_THRESHOLD 802 +#define SET_STR_PRECOMPUTE_THRESHOLD 1815 + +#define FAC_DSC_THRESHOLD 258 +#define FAC_ODD_THRESHOLD 24 + +#define MATRIX22_STRASSEN_THRESHOLD 10 
+#define HGCD2_DIV1_METHOD 1 /* 7.05% faster than 3 */ +#define HGCD_THRESHOLD 107 +#define HGCD_APPR_THRESHOLD 112 +#define HGCD_REDUCE_THRESHOLD 1679 +#define GCD_DC_THRESHOLD 324 +#define GCDEXT_DC_THRESHOLD 242 +#define JACOBI_BASE_METHOD 4 /* 22.41% faster than 1 */ + +/* Tuneup completed successfully, took 66624 seconds */ diff --git a/gcc/gmp/mpn/arm64/cora57/gmp-mparam.h b/gcc/gmp/mpn/arm64/cora57/gmp-mparam.h new file mode 100644 index 0000000..0d38621 100644 --- /dev/null +++ b/gcc/gmp/mpn/arm64/cora57/gmp-mparam.h @@ -1,0 +1,187 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file for a57, a72-a75. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 1800 MHz Cortex-A72 */ +/* FFT tuning limit = 0.5 M */ +/* Generated by tuneup.c, 2019-10-02, gcc 7.4 */ + +#define DIVREM_1_NORM_THRESHOLD 3 +#define DIVREM_1_UNNORM_THRESHOLD 4 +#define MOD_1_1P_METHOD 1 /* 2.21% faster than 2 */ +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 4 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 42 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 15 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_1N_PI1_METHOD 1 /* 34.95% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 5 +#define DIV_QR_1_UNNORM_THRESHOLD 5 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 33 + +#define DIV_1_VS_MUL_1_PERCENT 168 + +#define MUL_TOOM22_THRESHOLD 10 +#define MUL_TOOM33_THRESHOLD 41 +#define MUL_TOOM44_THRESHOLD 99 +#define MUL_TOOM6H_THRESHOLD 142 +#define MUL_TOOM8H_THRESHOLD 199 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 65 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 69 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 63 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 66 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 55 + +#define SQR_BASECASE_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 18 +#define SQR_TOOM3_THRESHOLD 65 +#define SQR_TOOM4_THRESHOLD 166 +#define SQR_TOOM6_THRESHOLD 222 +#define SQR_TOOM8_THRESHOLD 309 + +#define MULMID_TOOM42_THRESHOLD 22 + +#define MULMOD_BNM1_THRESHOLD 7 +#define SQRMOD_BNM1_THRESHOLD 12 + +#define MUL_FFT_MODF_THRESHOLD 276 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 276, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 13, 7}, { 7, 6}, { 15, 7}, { 8, 6}, \ + { 17, 7}, { 9, 6}, { 19, 7}, { 13, 8}, \ + { 7, 7}, { 17, 8}, { 9, 7}, { 20, 8}, \ + { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \ + { 21, 9}, { 11, 8}, { 25,10}, { 7, 9}, \ + { 15, 8}, { 33, 9}, { 19, 8}, { 39, 9}, \ + { 23, 
8}, { 49, 9}, { 27,10}, { 15, 9}, \ + { 39,10}, { 23, 9}, { 51,11}, { 15,10}, \ + { 31, 9}, { 67,10}, { 39, 9}, { 79,10}, \ + { 47, 9}, { 99,10}, { 55,11}, { 31,10}, \ + { 63, 8}, { 255,10}, { 71, 9}, { 143, 8}, \ + { 287,10}, { 79, 9}, { 159, 8}, { 319,11}, \ + { 47,10}, { 95, 9}, { 191,10}, { 103,12}, \ + { 31,11}, { 63, 9}, { 255, 8}, { 511,10}, \ + { 143, 8}, { 575,11}, { 79,10}, { 159, 9}, \ + { 319,10}, { 175, 9}, { 351, 8}, { 703,11}, \ + { 95,10}, { 191, 9}, { 383,10}, { 207, 9}, \ + { 415,10}, { 223, 9}, { 447, 8}, { 895,12}, \ + { 63,10}, { 255, 9}, { 511, 8}, { 1023, 9}, \ + { 543,11}, { 143,10}, { 287, 9}, { 575, 8}, \ + { 1151,10}, { 319, 9}, { 639,11}, { 175,10}, \ + { 351, 9}, { 703,12}, { 95,10}, { 383, 9}, \ + { 767,11}, { 207, 9}, { 831,11}, { 223,10}, \ + { 447, 9}, { 895,13}, { 8192,14}, { 16384,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 109 +#define MUL_FFT_THRESHOLD 3200 + +#define SQR_FFT_MODF_THRESHOLD 244 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 244, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 8, 5}, { 17, 6}, { 17, 7}, { 9, 6}, \ + { 19, 7}, { 17, 8}, { 9, 7}, { 20, 8}, \ + { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \ + { 19, 9}, { 11, 8}, { 25,10}, { 7, 9}, \ + { 15, 8}, { 33, 9}, { 19, 8}, { 39, 9}, \ + { 27,10}, { 15, 9}, { 39,10}, { 23, 9}, \ + { 47,11}, { 15,10}, { 31, 9}, { 67,10}, \ + { 39, 9}, { 79,10}, { 47, 9}, { 99,10}, \ + { 55,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255,10}, { 71, 8}, { 287, 7}, { 575, 9}, \ + { 159, 8}, { 319,11}, { 47,10}, { 95, 9}, \ + { 191, 8}, { 383,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 143, 9}, { 287, 8}, \ + { 575,11}, { 79,10}, { 159, 9}, { 319, 8}, \ + { 639, 9}, { 351,10}, { 191, 9}, { 383,10}, \ + { 207, 9}, { 415,10}, { 239,12}, { 63,10}, \ + { 255, 9}, { 511,10}, { 271,11}, { 143,10}, \ + { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 351, 9}, { 
703,11}, { 191,10}, \ + { 383, 9}, { 767,11}, { 207,10}, { 415, 9}, \ + { 831,11}, { 223,13}, { 8192,14}, { 16384,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 97 +#define SQR_FFT_THRESHOLD 2496 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 39 +#define MULLO_MUL_N_THRESHOLD 6253 +#define SQRLO_BASECASE_THRESHOLD 4 +#define SQRLO_DC_THRESHOLD 56 +#define SQRLO_SQR_THRESHOLD 4940 + +#define DC_DIV_QR_THRESHOLD 41 +#define DC_DIVAPPR_Q_THRESHOLD 136 +#define DC_BDIV_QR_THRESHOLD 39 +#define DC_BDIV_Q_THRESHOLD 89 + +#define INV_MULMOD_BNM1_THRESHOLD 22 +#define INV_NEWTON_THRESHOLD 154 +#define INV_APPR_THRESHOLD 141 + +#define BINV_NEWTON_THRESHOLD 182 +#define REDC_1_TO_REDC_N_THRESHOLD 39 + +#define MU_DIV_QR_THRESHOLD 979 +#define MU_DIVAPPR_Q_THRESHOLD 1078 +#define MUPI_DIV_QR_THRESHOLD 75 +#define MU_BDIV_QR_THRESHOLD 872 +#define MU_BDIV_Q_THRESHOLD 942 + +#define POWM_SEC_TABLE 1,19,117,539,1730 + +#define GET_STR_DC_THRESHOLD 10 +#define GET_STR_PRECOMPUTE_THRESHOLD 21 +#define SET_STR_DC_THRESHOLD 572 +#define SET_STR_PRECOMPUTE_THRESHOLD 1036 + +#define FAC_DSC_THRESHOLD 142 +#define FAC_ODD_THRESHOLD 23 + +#define MATRIX22_STRASSEN_THRESHOLD 11 +#define HGCD2_DIV1_METHOD 1 /* 8.83% faster than 3 */ +#define HGCD_THRESHOLD 80 +#define HGCD_APPR_THRESHOLD 70 +#define HGCD_REDUCE_THRESHOLD 1962 +#define GCD_DC_THRESHOLD 273 +#define GCDEXT_DC_THRESHOLD 198 +#define JACOBI_BASE_METHOD 1 /* 7.49% faster than 4 */ diff --git a/gcc/gmp/mpn/arm64/cora72/gmp-mparam.h b/gcc/gmp/mpn/arm64/cora72/gmp-mparam.h new file mode 100644 index 0000000..fc66fd3 100644 --- /dev/null +++ b/gcc/gmp/mpn/arm64/cora72/gmp-mparam.h @@ -1,0 +1,242 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file for a72. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 1800 MHz Cortex-A72 */ +/* FFT tuning limit = 50,811,960 */ +/* Generated by tuneup.c, 2019-10-22, gcc 7.3 */ + +#define DIVREM_1_NORM_THRESHOLD 3 +#define DIVREM_1_UNNORM_THRESHOLD 3 +#define MOD_1_1P_METHOD 2 /* 12.09% faster than 1 */ +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 3 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 26 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 15 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_1N_PI1_METHOD 1 /* 13.42% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 4 +#define DIV_QR_1_UNNORM_THRESHOLD 4 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 38 + +#define DIV_1_VS_MUL_1_PERCENT 168 + +#define MUL_TOOM22_THRESHOLD 8 +#define MUL_TOOM33_THRESHOLD 57 +#define MUL_TOOM44_THRESHOLD 153 +#define MUL_TOOM6H_THRESHOLD 222 +#define 
MUL_TOOM8H_THRESHOLD 333 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 57 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 108 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 104 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 56 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 82 + +#define SQR_BASECASE_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 16 +#define SQR_TOOM3_THRESHOLD 73 +#define SQR_TOOM4_THRESHOLD 154 +#define SQR_TOOM6_THRESHOLD 206 +#define SQR_TOOM8_THRESHOLD 333 + +#define MULMID_TOOM42_THRESHOLD 18 + +#define MULMOD_BNM1_THRESHOLD 8 +#define SQRMOD_BNM1_THRESHOLD 10 + +#define MUL_FFT_MODF_THRESHOLD 268 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 268, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \ + { 15, 7}, { 13, 8}, { 7, 7}, { 16, 8}, \ + { 9, 7}, { 19, 8}, { 11, 7}, { 23, 8}, \ + { 13, 9}, { 7, 8}, { 15, 7}, { 31, 8}, \ + { 19, 9}, { 11, 8}, { 27,10}, { 7, 9}, \ + { 15, 8}, { 33, 9}, { 19, 8}, { 39, 9}, \ + { 27,10}, { 15, 9}, { 39,10}, { 23, 9}, \ + { 51,11}, { 15,10}, { 31, 9}, { 71,10}, \ + { 39, 9}, { 79, 8}, { 159, 7}, { 319, 9}, \ + { 83,10}, { 47, 9}, { 95, 7}, { 383, 9}, \ + { 99,10}, { 55,11}, { 31,10}, { 63, 8}, \ + { 255, 7}, { 511, 9}, { 131,10}, { 71, 9}, \ + { 143, 8}, { 287, 7}, { 575, 6}, { 1151,10}, \ + { 79, 8}, { 319, 7}, { 639,10}, { 87, 8}, \ + { 351,11}, { 47,10}, { 95, 8}, { 383, 7}, \ + { 767,10}, { 103, 8}, { 415, 7}, { 831, 6}, \ + { 1663, 9}, { 223, 8}, { 447,12}, { 31,11}, \ + { 63, 9}, { 255, 8}, { 511, 7}, { 1023, 9}, \ + { 287, 8}, { 575, 7}, { 1151, 6}, { 2303, 7}, \ + { 1215,11}, { 79, 9}, { 319, 8}, { 639, 7}, \ + { 1279, 9}, { 351, 8}, { 703, 7}, { 1407, 6}, \ + { 2815, 9}, { 383, 8}, { 831, 7}, { 1663, 9}, \ + { 447, 8}, { 895, 7}, { 1791, 6}, { 3583, 8}, \ + { 959, 6}, { 3839, 5}, { 7679, 9}, { 511, 8}, \ + { 1023, 7}, { 2175, 9}, { 575, 8}, { 1151, 7}, \ + { 2303, 8}, { 1215,10}, { 351, 9}, { 703, 7}, \ + { 3071, 8}, { 1663, 9}, { 895, 8}, { 1791, 7}, \ + { 3583, 8}, { 1919, 6}, { 7679, 7}, { 3967, 9}, \ + { 1023,10}, { 575, 9}, { 1151, 8}, { 
2559,10}, \ + { 703, 8}, { 2815, 9}, { 1471, 7}, { 5887,10}, \ + { 767,11}, { 415, 9}, { 1791, 8}, { 3583,11}, \ + { 479,10}, { 959, 8}, { 3967,11}, { 511, 9}, \ + { 2175,10}, { 1151, 8}, { 4607, 9}, { 2815,10}, \ + { 1471, 9}, { 2943,11}, { 767,10}, { 1535,11}, \ + { 831,10}, { 1791,11}, { 959,10}, { 1919, 9}, \ + { 3839, 8}, { 7679,10}, { 1983,12}, { 511,10}, \ + { 2047,11}, { 1215,12}, { 639,11}, { 1407,10}, \ + { 2815,11}, { 1471,12}, { 767,11}, { 1663,12}, \ + { 895,11}, { 1791,12}, { 959,11}, { 1919,10}, \ + { 3839,14}, { 255,13}, { 511,12}, { 1023,11}, \ + { 2047,12}, { 1215,13}, { 639,12}, { 1279,13}, \ + { 703,12}, { 1407,11}, { 2815,13}, { 767,12}, \ + { 1535,13}, { 831,12}, { 1663,13}, { 895,12}, \ + { 1791,11}, { 3583,13}, { 959,12}, { 1919,11}, \ + { 3839,14}, { 511,13}, { 1023,12}, { 2047,13}, \ + { 1215,12}, { 2431,14}, { 639,13}, { 1407,12}, \ + { 2815,13}, { 1471,12}, { 2943,14}, { 767,13}, \ + { 1535,12}, { 3071,13}, { 1791,12}, { 3583,13}, \ + { 1919,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2431,14}, { 1279,13}, { 2559,15}, { 767,14}, \ + { 1791,13}, { 3839,15}, { 1023,14}, { 2431,13}, \ + { 4863,15}, { 1279,14}, { 2943,15}, { 1535,14}, \ + { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 218 +#define MUL_FFT_THRESHOLD 2688 + +#define SQR_FFT_MODF_THRESHOLD 236 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 236, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 15, 7}, { 8, 6}, { 17, 7}, { 13, 8}, \ + { 7, 7}, { 17, 8}, { 9, 7}, { 20, 8}, \ + { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \ + { 19, 9}, { 11, 8}, { 25,10}, { 7, 9}, \ + { 15, 8}, { 33, 9}, { 19, 8}, { 39, 9}, \ + { 27,10}, { 15, 9}, { 39,10}, { 23, 9}, \ + { 47,11}, { 15,10}, { 31, 9}, { 67,10}, \ + { 39, 9}, { 79, 8}, { 159,10}, { 47, 9}, \ + { 95, 8}, { 191, 7}, { 383,10}, { 55,11}, \ + { 31,10}, { 63, 9}, { 127, 8}, { 255, 7}, \ + { 511,10}, { 71, 9}, { 143, 8}, { 287, 
7}, \ + { 575,10}, { 79, 8}, { 319, 7}, { 639,11}, \ + { 47,10}, { 95, 8}, { 383, 7}, { 767, 8}, \ + { 415,12}, { 31,11}, { 63,10}, { 127, 9}, \ + { 255, 8}, { 543, 9}, { 287, 8}, { 575, 7}, \ + { 1151, 9}, { 319, 8}, { 639, 9}, { 351, 8}, \ + { 703, 7}, { 1407, 6}, { 2815,10}, { 191, 9}, \ + { 383, 8}, { 767, 9}, { 415, 8}, { 831, 7}, \ + { 1663,10}, { 223, 9}, { 447, 8}, { 895, 7}, \ + { 1791, 9}, { 479, 8}, { 959,12}, { 63,11}, \ + { 127, 9}, { 543, 8}, { 1087,10}, { 287, 9}, \ + { 575, 8}, { 1151,10}, { 319, 9}, { 639,10}, \ + { 351, 9}, { 703, 8}, { 1407, 7}, { 2815, 8}, \ + { 1471, 5}, { 11775, 9}, { 767, 8}, { 1535,10}, \ + { 415, 9}, { 895, 8}, { 1919, 6}, { 7679, 7}, \ + { 3967,11}, { 255,10}, { 543, 9}, { 1087, 8}, \ + { 2175,10}, { 575, 9}, { 1151, 8}, { 2431,10}, \ + { 639, 9}, { 1279,10}, { 703, 9}, { 1407, 8}, \ + { 2943,11}, { 383,10}, { 767,11}, { 447,10}, \ + { 895,11}, { 479,10}, { 959, 9}, { 1919, 8}, \ + { 3839,10}, { 1023, 9}, { 2175,10}, { 1215, 9}, \ + { 2431,11}, { 703, 9}, { 2815,10}, { 1471,11}, \ + { 767,10}, { 1663,11}, { 895,10}, { 1791,11}, \ + { 959, 9}, { 3839,12}, { 511,11}, { 1087,10}, \ + { 2175,11}, { 1215,10}, { 2431,12}, { 639,11}, \ + { 1279,12}, { 703,11}, { 1471,12}, { 767,11}, \ + { 1663,12}, { 895,11}, { 1919,10}, { 3839,13}, \ + { 511,12}, { 1087,11}, { 2175,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1279,13}, { 703,12}, \ + { 1407,13}, { 767,12}, { 1535,13}, { 831,12}, \ + { 1791,13}, { 1151,12}, { 2303,13}, { 1215,14}, \ + { 639,12}, { 2559,13}, { 1407,14}, { 767,12}, \ + { 3071,14}, { 895,13}, { 1919,12}, { 3839,14}, \ + { 1023,13}, { 2175,14}, { 1151,12}, { 4607,14}, \ + { 1279,13}, { 2559,14}, { 1407,13}, { 2943,15}, \ + { 767,14}, { 1663,13}, { 3583,14}, { 1919,15}, \ + { 1023,14}, { 2047,13}, { 4095,14}, { 2943,15}, \ + { 1535,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 203 
+#define SQR_FFT_THRESHOLD 2176 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 33 +#define MULLO_MUL_N_THRESHOLD 5240 +#define SQRLO_BASECASE_THRESHOLD 6 +#define SQRLO_DC_THRESHOLD 45 +#define SQRLO_SQR_THRESHOLD 4265 + +#define DC_DIV_QR_THRESHOLD 38 +#define DC_DIVAPPR_Q_THRESHOLD 108 +#define DC_BDIV_QR_THRESHOLD 36 +#define DC_BDIV_Q_THRESHOLD 71 + +#define INV_MULMOD_BNM1_THRESHOLD 14 +#define INV_NEWTON_THRESHOLD 132 +#define INV_APPR_THRESHOLD 124 + +#define BINV_NEWTON_THRESHOLD 199 +#define REDC_1_TO_REDC_N_THRESHOLD 34 + +#define MU_DIV_QR_THRESHOLD 979 +#define MU_DIVAPPR_Q_THRESHOLD 979 +#define MUPI_DIV_QR_THRESHOLD 61 +#define MU_BDIV_QR_THRESHOLD 734 +#define MU_BDIV_Q_THRESHOLD 942 + +#define POWM_SEC_TABLE 6,30,110,579,1730 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 19 +#define SET_STR_DC_THRESHOLD 458 +#define SET_STR_PRECOMPUTE_THRESHOLD 875 + +#define FAC_DSC_THRESHOLD 153 +#define FAC_ODD_THRESHOLD 24 + +#define MATRIX22_STRASSEN_THRESHOLD 15 +#define HGCD2_DIV1_METHOD 1 /* 8.41% faster than 3 */ +#define HGCD_THRESHOLD 81 +#define HGCD_APPR_THRESHOLD 80 +#define HGCD_REDUCE_THRESHOLD 1494 +#define GCD_DC_THRESHOLD 268 +#define GCDEXT_DC_THRESHOLD 189 +#define JACOBI_BASE_METHOD 1 /* 10.80% faster than 4 */ + +/* Tuneup completed successfully, took 96906 seconds */ diff --git a/gcc/gmp/mpn/arm64/cora73/gmp-mparam.h b/gcc/gmp/mpn/arm64/cora73/gmp-mparam.h new file mode 100644 index 0000000..7fc7f4e 100644 --- /dev/null +++ b/gcc/gmp/mpn/arm64/cora73/gmp-mparam.h @@ -1,0 +1,225 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file for a73. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 1800 MHz Cortex-A72 -- NOTE(review): the file header and directory name say a73; confirm which CPU these numbers were tuned on */ +/* FFT tuning limit = 48,820,337 */ +/* Generated by tuneup.c, 2019-10-22, gcc 7.4 */ + +#define DIVREM_1_NORM_THRESHOLD 3 +#define DIVREM_1_UNNORM_THRESHOLD 3 +#define MOD_1_1P_METHOD 1 /* 2.28% faster than 2 */ +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 4 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 44 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 16 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_1N_PI1_METHOD 1 /* 35.13% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 5 +#define DIV_QR_1_UNNORM_THRESHOLD 5 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 33 + +#define DIV_1_VS_MUL_1_PERCENT 168 + +#define MUL_TOOM22_THRESHOLD 10 +#define MUL_TOOM33_THRESHOLD 57 +#define MUL_TOOM44_THRESHOLD 89 +#define MUL_TOOM6H_THRESHOLD 141 +#define MUL_TOOM8H_THRESHOLD 199 + +#define
MUL_TOOM32_TO_TOOM43_THRESHOLD 61 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 69 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 65 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 66 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 58 + +#define SQR_BASECASE_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 18 +#define SQR_TOOM3_THRESHOLD 62 +#define SQR_TOOM4_THRESHOLD 166 +#define SQR_TOOM6_THRESHOLD 222 +#define SQR_TOOM8_THRESHOLD 309 + +#define MULMID_TOOM42_THRESHOLD 22 + +#define MULMOD_BNM1_THRESHOLD 8 +#define SQRMOD_BNM1_THRESHOLD 11 + +#define MUL_FFT_MODF_THRESHOLD 276 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 276, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 15, 7}, { 8, 6}, { 17, 7}, { 9, 6}, \ + { 19, 7}, { 13, 8}, { 7, 7}, { 17, 8}, \ + { 9, 7}, { 19, 8}, { 11, 7}, { 23, 8}, \ + { 13, 9}, { 7, 8}, { 19, 9}, { 11, 8}, \ + { 27,10}, { 7, 9}, { 15, 8}, { 33, 9}, \ + { 19, 8}, { 39, 9}, { 23, 8}, { 47, 9}, \ + { 27,10}, { 15, 9}, { 43,10}, { 23, 9}, \ + { 51,11}, { 15,10}, { 31, 9}, { 67,10}, \ + { 39, 9}, { 83,10}, { 47, 9}, { 99,10}, \ + { 55,11}, { 31,10}, { 63, 9}, { 127, 8}, \ + { 255, 9}, { 131,10}, { 71, 9}, { 143, 8}, \ + { 287,10}, { 79, 9}, { 159, 8}, { 319,11}, \ + { 47, 9}, { 191, 8}, { 383, 7}, { 767, 8}, \ + { 415,12}, { 31,11}, { 63, 9}, { 255, 8}, \ + { 511,10}, { 143, 9}, { 287, 8}, { 575,11}, \ + { 79,10}, { 159, 9}, { 319,10}, { 175, 9}, \ + { 351, 8}, { 703,11}, { 95,10}, { 191, 9}, \ + { 383, 8}, { 767,10}, { 207, 9}, { 415,10}, \ + { 223, 9}, { 447,12}, { 63,10}, { 255, 9}, \ + { 511, 8}, { 1023, 9}, { 543,11}, { 143, 9}, \ + { 575,10}, { 319, 9}, { 639,10}, { 351, 9}, \ + { 703,12}, { 95,11}, { 191,10}, { 383,11}, \ + { 207,10}, { 415,11}, { 223,10}, { 447, 9}, \ + { 895,13}, { 63,11}, { 255,10}, { 511,11}, \ + { 287,10}, { 575,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 351,10}, { 703, 9}, { 1407,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,12}, \ + { 223,11}, { 447,10}, { 895,11}, { 479,10}, \ + { 959,12}, { 255,11}, { 543,10}, { 1087,11}, \ + { 575,12},
{ 319,11}, { 639,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 447,11}, { 895,12}, \ + { 479,13}, { 255,12}, { 511,11}, { 1023,12}, \ + { 575,13}, { 319,12}, { 703,13}, { 383,12}, \ + { 831,13}, { 447,12}, { 959,14}, { 255,13}, \ + { 511,12}, { 1023,13}, { 575,12}, { 1151,13}, \ + { 703,12}, { 1407,14}, { 383,13}, { 831,12}, \ + { 1663,13}, { 959,15}, { 255,14}, { 511,13}, \ + { 1151,14}, { 639,13}, { 1407,14}, { 767,13}, \ + { 1663,14}, { 895,13}, { 1791,15}, { 511,14}, \ + { 1023,13}, { 2047,14}, { 1151,13}, { 2431,14}, \ + { 1407,15}, { 767,14}, { 1791,16}, { 511,15}, \ + { 1023,14}, { 2431,15}, { 1279,14}, { 2815,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 185 /* number of entries in MUL_FFT_TABLE3 above */ +#define MUL_FFT_THRESHOLD 3200 + +#define SQR_FFT_MODF_THRESHOLD 244 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 244, 5}, { 13, 6}, { 7, 5}, { 15, 6}, \ + { 17, 7}, { 9, 6}, { 19, 7}, { 17, 8}, \ + { 9, 7}, { 20, 8}, { 11, 7}, { 23, 8}, \ + { 13, 9}, { 7, 8}, { 19, 9}, { 11, 8}, \ + { 25,10}, { 7, 9}, { 15, 8}, { 31, 9}, \ + { 19, 8}, { 39, 9}, { 23, 8}, { 47,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \ + { 15,10}, { 31, 9}, { 63,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 63, 8}, { 255,10}, { 71, 9}, \ + { 143, 8}, { 287,10}, { 79, 9}, { 159, 8}, \ + { 319,11}, { 47,10}, { 95, 9}, { 191, 8}, \ + { 383,12}, { 31,11}, { 63,10}, { 127, 9}, \ + { 287, 8}, { 575,11}, { 79,10}, { 159, 9}, \ + { 319, 8}, { 639,10}, { 175, 9}, { 351, 8}, \ + { 703,11}, { 95, 9}, { 383, 8}, { 767,10}, \ + { 207, 9}, { 415,10}, { 223, 8}, { 895,10}, \ + { 239,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 287, 9}, { 575,11}, { 159,10}, \ + { 319, 9}, { 639,11}, { 175,10}, { 351, 9}, \ + { 703,11}, { 191,10}, { 383,11}, { 207,10}, \ + { 415,11}, { 223,10}, { 479,11}, { 255,10}, \ + { 511,11}, { 287,10}, { 575,12}, {
159,11}, \ + { 351,12}, { 191,11}, { 383,10}, { 767,12}, \ + { 223,11}, { 447,10}, { 895,11}, { 479,13}, \ + { 127,12}, { 255,11}, { 511,12}, { 287,10}, \ + { 1151,12}, { 319,11}, { 639,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 447,11}, { 895,12}, \ + { 479,11}, { 959,12}, { 511,11}, { 1023,12}, \ + { 575,11}, { 1151,13}, { 319,12}, { 639,11}, \ + { 1279,13}, { 383,12}, { 831,13}, { 447,12}, \ + { 895,14}, { 255,13}, { 511,12}, { 1023,13}, \ + { 703,14}, { 383,13}, { 831,12}, { 1663,13}, \ + { 895,15}, { 255,14}, { 511,13}, { 1151,14}, \ + { 639,13}, { 1407,14}, { 767,13}, { 1535,14}, \ + { 895,15}, { 511,14}, { 1151,13}, { 2431,14}, \ + { 1407,15}, { 767,14}, { 1791,16}, { 511,15}, \ + { 1023,14}, { 2431,15}, { 1279,14}, { 2815,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 165 /* number of entries in SQR_FFT_TABLE3 above */ +#define SQR_FFT_THRESHOLD 2496 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 39 +#define MULLO_MUL_N_THRESHOLD 6253 +#define SQRLO_BASECASE_THRESHOLD 4 +#define SQRLO_DC_THRESHOLD 56 +#define SQRLO_SQR_THRESHOLD 4940 + +#define DC_DIV_QR_THRESHOLD 36 +#define DC_DIVAPPR_Q_THRESHOLD 136 +#define DC_BDIV_QR_THRESHOLD 35 +#define DC_BDIV_Q_THRESHOLD 88 + +#define INV_MULMOD_BNM1_THRESHOLD 30 +#define INV_NEWTON_THRESHOLD 149 +#define INV_APPR_THRESHOLD 139 + +#define BINV_NEWTON_THRESHOLD 166 +#define REDC_1_TO_REDC_N_THRESHOLD 38 + +#define MU_DIV_QR_THRESHOLD 1120 +#define MU_DIVAPPR_Q_THRESHOLD 1078 +#define MUPI_DIV_QR_THRESHOLD 68 +#define MU_BDIV_QR_THRESHOLD 889 +#define MU_BDIV_Q_THRESHOLD 942 + +#define POWM_SEC_TABLE 4,22,102,473,1730 + +#define GET_STR_DC_THRESHOLD 11 +#define GET_STR_PRECOMPUTE_THRESHOLD 22 +#define SET_STR_DC_THRESHOLD 381 +#define SET_STR_PRECOMPUTE_THRESHOLD 1042 + +#define FAC_DSC_THRESHOLD 140 +#define FAC_ODD_THRESHOLD 23 + +#define
MATRIX22_STRASSEN_THRESHOLD 11 +#define HGCD2_DIV1_METHOD 1 /* 7.84% faster than 3 */ +#define HGCD_THRESHOLD 80 +#define HGCD_APPR_THRESHOLD 80 +#define HGCD_REDUCE_THRESHOLD 1679 +#define GCD_DC_THRESHOLD 273 +#define GCDEXT_DC_THRESHOLD 201 +#define JACOBI_BASE_METHOD 1 /* 1.03% faster than 4 */ + +/* Tuneup completed successfully, took 64972 seconds */ diff --git a/gcc/gmp/mpn/arm64/xgene1/gmp-mparam.h b/gcc/gmp/mpn/arm64/xgene1/gmp-mparam.h new file mode 100644 index 0000000..7cc3cb3 100644 --- /dev/null +++ b/gcc/gmp/mpn/arm64/xgene1/gmp-mparam.h @@ -1,0 +1,181 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/.
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 2400 MHz AppliedMicro X-Gene */ +/* FFT tuning limit = 0.5 M */ +/* Generated by tuneup.c, 2019-09-28, gcc 4.8 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_1P_METHOD 1 /* 2.00% faster than 2 */ +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 22 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_1N_PI1_METHOD 1 /* 37.38% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD 1 +#define DIV_QR_2_PI2_THRESHOLD 14 +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 27 + +#define DIV_1_VS_MUL_1_PERCENT 249 + +#define MUL_TOOM22_THRESHOLD 18 +#define MUL_TOOM33_THRESHOLD 61 +#define MUL_TOOM44_THRESHOLD 112 +#define MUL_TOOM6H_THRESHOLD 242 +#define MUL_TOOM8H_THRESHOLD 321 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 99 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 109 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 72 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 106 + +#define SQR_BASECASE_THRESHOLD 0 /* always */ +#define SQR_TOOM2_THRESHOLD 28 +#define SQR_TOOM3_THRESHOLD 81 +#define SQR_TOOM4_THRESHOLD 154 +#define SQR_TOOM6_THRESHOLD 214 +#define SQR_TOOM8_THRESHOLD 284 + +#define MULMID_TOOM42_THRESHOLD 46 + +#define MULMOD_BNM1_THRESHOLD 11 +#define SQRMOD_BNM1_THRESHOLD 13 + +#define MUL_FFT_MODF_THRESHOLD 412 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 412, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \ + { 19, 7}, { 12, 6}, { 25, 7}, { 17, 8}, \ + { 9, 7}, { 20, 8}, { 11, 7}, { 25, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 17, 7}, { 35, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, {
35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 49, 9}, { 27,10}, \ + { 15, 9}, { 31, 8}, { 63, 9}, { 39,10}, \ + { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \ + { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 99,10}, { 55,11}, { 31,10}, { 63, 9}, \ + { 127,10}, { 71, 9}, { 143,10}, { 79,11}, \ + { 47,10}, { 103,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 135, 9}, { 271,10}, \ + { 143,11}, { 79, 9}, { 319,10}, { 167, 9}, \ + { 351,11}, { 95, 9}, { 383, 8}, { 767,10}, \ + { 207, 9}, { 415,11}, { 111,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,11}, { 143,10}, { 287, 9}, { 575,10}, \ + { 319, 9}, { 639,10}, { 351,12}, { 95,10}, \ + { 383, 9}, { 767,11}, { 207,10}, { 415, 9}, \ + { 831,11}, { 223,10}, { 447,13}, { 8192,14}, \ + { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 98 /* number of entries in MUL_FFT_TABLE3 above */ +#define MUL_FFT_THRESHOLD 4736 + +#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 340, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 19, 7}, { 10, 6}, { 21, 7}, { 11, 6}, \ + { 23, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 31, 8}, { 63, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95, 9}, { 191,12}, { 31,11}, \ + { 63,10}, { 127, 9}, { 255, 8}, { 511,10}, \ + { 135, 9}, { 271,11}, { 79, 9}, { 319, 8}, \ + { 639,10}, { 175,11}, { 95,10}, { 191, 9}, \ + { 383,10}, { 207,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \ + { 287, 9}, { 575,10}, { 319, 9}, { 639,11}, \ + { 175,10}, { 351,12}, { 95,11}, { 191,10}, \ + { 383, 9}, { 767,11}, { 207,10}, { 415,13}, \ + { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19},
{ 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 87 /* number of entries in SQR_FFT_TABLE3 above */ +#define SQR_FFT_THRESHOLD 3264 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 45 +#define MULLO_MUL_N_THRESHOLD 8648 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 108 +#define SQRLO_SQR_THRESHOLD 6461 + +#define DC_DIV_QR_THRESHOLD 64 +#define DC_DIVAPPR_Q_THRESHOLD 222 +#define DC_BDIV_QR_THRESHOLD 63 +#define DC_BDIV_Q_THRESHOLD 132 + +#define INV_MULMOD_BNM1_THRESHOLD 38 +#define INV_NEWTON_THRESHOLD 242 +#define INV_APPR_THRESHOLD 222 + +#define BINV_NEWTON_THRESHOLD 254 +#define REDC_1_TO_REDC_N_THRESHOLD 66 + +#define MU_DIV_QR_THRESHOLD 1234 +#define MU_DIVAPPR_Q_THRESHOLD 1234 +#define MUPI_DIV_QR_THRESHOLD 122 +#define MU_BDIV_QR_THRESHOLD 1210 +#define MU_BDIV_Q_THRESHOLD 1234 + +#define POWM_SEC_TABLE 3,23,194,712,2499 + +#define GET_STR_DC_THRESHOLD 11 +#define GET_STR_PRECOMPUTE_THRESHOLD 22 +#define SET_STR_DC_THRESHOLD 381 +#define SET_STR_PRECOMPUTE_THRESHOLD 2503 + +#define FAC_DSC_THRESHOLD 216 +#define FAC_ODD_THRESHOLD 26 + +#define MATRIX22_STRASSEN_THRESHOLD 14 +#define HGCD2_DIV1_METHOD 5 /* 2.01% faster than 3 */ +#define HGCD_THRESHOLD 122 +#define HGCD_APPR_THRESHOLD 171 +#define HGCD_REDUCE_THRESHOLD 2479 +#define GCD_DC_THRESHOLD 541 +#define GCDEXT_DC_THRESHOLD 386 +#define JACOBI_BASE_METHOD 4 /* 7.46% faster than 1 */ diff --git a/gcc/gmp/mpn/mips64/hilo/addmul_1.asm b/gcc/gmp/mpn/mips64/hilo/addmul_1.asm new file mode 100644 index 0000000..8ff0976 100644 --- /dev/null +++ b/gcc/gmp/mpn/mips64/hilo/addmul_1.asm @@ -1,0 +1,101 @@ +dnl MIPS64 mpn_addmul_1 -- Multiply a limb vector with a single limb and add +dnl the product to a second limb vector. + +dnl Copyright 1992, 1994, 1995, 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr $4 (limbs are loaded, updated and stored back in place) +C s1_ptr $5 +C size $6 (limb count; the first s1 limb is loaded before size is tested) +C s2_limb $7 (the final carry limb is left in $2 on return) + +ASM_START() +PROLOGUE(mpn_addmul_1) + +C feed-in phase 0 + ld $8,0($5) C $8 = first s1 limb + +C feed-in phase 1 + daddiu $5,$5,8 C step s1_ptr to the next 8-byte limb + dmultu $8,$7 C HI:LO = s1 limb times s2_limb + + daddiu $6,$6,-1 + beq $6,$0,$LC0 C size was 1: only wind-down phase 0 is left + move $2,$0 C zero cy2 + + daddiu $6,$6,-1 + beq $6,$0,$LC1 C size was 2: skip the loop + ld $8,0($5) C load new s1 limb as early as possible + +Loop: ld $10,0($4) C $10 = res limb to accumulate into + mflo $3 C $3 = low product limb + mfhi $9 C $9 = high product limb + daddiu $5,$5,8 + daddu $3,$3,$2 C add old carry limb to low product limb + dmultu $8,$7 C start the next product + ld $8,0($5) C load new s1 limb as early as possible + daddiu $6,$6,-1 C decrement loop counter + sltu $2,$3,$2 C carry from previous addition -> $2 + daddu $3,$10,$3 C add the res limb + sltu $10,$3,$10 C carry out of that addition + daddu $2,$2,$10 C merge the two carry bits + sd $3,0($4) C store the updated res limb + daddiu $4,$4,8 + bne $6,$0,Loop + daddu $2,$9,$2 C add high product limb and carry from addition + +C wind-down phase 1 +$LC1: ld $10,0($4) + mflo $3 + mfhi $9 + daddu $3,$3,$2 + sltu $2,$3,$2 + dmultu $8,$7 C product of the last s1 limb, consumed in phase 0 + daddu $3,$10,$3 + sltu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) +
daddiu $4,$4,8 + daddu $2,$9,$2 C add high product limb and carry from addition + +C wind-down phase 0 +$LC0: ld $10,0($4) + mflo $3 + mfhi $9 + daddu $3,$3,$2 + sltu $2,$3,$2 + daddu $3,$10,$3 + sltu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) C store the last res limb + j $31 C return; NOTE(review): the next instruction looks placed for the branch delay slot, confirm ASM_START selects noreorder + daddu $2,$9,$2 C add high product limb and carry from addition +EPILOGUE(mpn_addmul_1) diff --git a/gcc/gmp/mpn/mips64/hilo/mul_1.asm b/gcc/gmp/mpn/mips64/hilo/mul_1.asm new file mode 100644 index 0000000..77acf0a 100644 --- /dev/null +++ b/gcc/gmp/mpn/mips64/hilo/mul_1.asm @@ -1,0 +1,92 @@ +dnl MIPS64 mpn_mul_1 -- Multiply a limb vector with a single limb and store +dnl the product in a second limb vector. + +dnl Copyright 1992, 1994, 1995, 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/.
+ +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr $4 (written only; one limb stored per s1 limb) +C s1_ptr $5 +C size $6 (limb count; the first s1 limb is loaded before size is tested) +C s2_limb $7 (the final carry limb is left in $2 on return) + +ASM_START() +PROLOGUE(mpn_mul_1) + +C feed-in phase 0 + ld $8,0($5) C $8 = first s1 limb + +C feed-in phase 1 + daddiu $5,$5,8 C step s1_ptr to the next 8-byte limb + dmultu $8,$7 C HI:LO = s1 limb times s2_limb + + daddiu $6,$6,-1 + beq $6,$0,$LC0 C size was 1: only wind-down phase 0 is left + move $2,$0 C zero cy2 + + daddiu $6,$6,-1 + beq $6,$0,$LC1 C size was 2: skip the loop + ld $8,0($5) C load new s1 limb as early as possible + +Loop: nop + mflo $10 C $10 = low product limb + mfhi $9 C $9 = high product limb + daddiu $5,$5,8 + daddu $10,$10,$2 C add old carry limb to low product limb + dmultu $8,$7 C start the next product + ld $8,0($5) C load new s1 limb as early as possible + daddiu $6,$6,-1 C decrement loop counter + sltu $2,$10,$2 C carry from previous addition -> $2 + nop + nop + sd $10,0($4) C store the result limb + daddiu $4,$4,8 + bne $6,$0,Loop + daddu $2,$9,$2 C add high product limb and carry from addition + +C wind-down phase 1 +$LC1: mflo $10 + mfhi $9 + daddu $10,$10,$2 + sltu $2,$10,$2 + dmultu $8,$7 C product of the last s1 limb, consumed in phase 0 + sd $10,0($4) + daddiu $4,$4,8 + daddu $2,$9,$2 C add high product limb and carry from addition + +C wind-down phase 0 +$LC0: mflo $10 + mfhi $9 + daddu $10,$10,$2 + sltu $2,$10,$2 + sd $10,0($4) C store the last result limb + j $31 C return; NOTE(review): the next instruction looks placed for the branch delay slot, confirm ASM_START selects noreorder + daddu $2,$9,$2 C add high product limb and carry from addition +EPILOGUE(mpn_mul_1) diff --git a/gcc/gmp/mpn/mips64/hilo/sqr_diagonal.asm b/gcc/gmp/mpn/mips64/hilo/sqr_diagonal.asm new file mode 100644 index 0000000..dcb87dc 100644 --- /dev/null +++ b/gcc/gmp/mpn/mips64/hilo/sqr_diagonal.asm @@ -1,0 +1,77 @@ +dnl MIPS64 mpn_sqr_diagonal. + +dnl Copyright 2001, 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version.
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl INPUT PARAMETERS +dnl rp $4 (receives two limbs per input limb: low half then high half of its square) +dnl up $5 (input limbs; each one is multiplied by itself) +dnl n $6 (limb count; the first limb is loaded before n is tested) + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_sqr_diagonal) + ld r8,0(r5) C r8 = first input limb + daddiu r6,r6,-2 + dmultu r8,r8 C HI:LO = square of that limb + bltz r6,$Lend1 C fewer than two limbs: store one square and return + nop + ld r8,8(r5) C r8 = second input limb + beq r6,r0,$Lend2 C exactly two limbs: skip the loop + nop + +$Loop: mflo r10 C low half of the pending square + mfhi r9 C high half of the pending square + daddiu r6,r6,-1 + sd r10,0(r4) + sd r9,8(r4) + dmultu r8,r8 C square the limb loaded on the previous pass + ld r8,16(r5) C load the limb to be squared on the next pass + daddiu r5,r5,8 + bne r6,r0,$Loop + daddiu r4,r4,16 C two result limbs are written per input limb + +$Lend2: mflo r10 + mfhi r9 + sd r10,0(r4) + sd r9,8(r4) + dmultu r8,r8 C square of the last limb + mflo r10 + mfhi r9 + sd r10,16(r4) + j r31 C return; NOTE(review): the next store looks placed for the branch delay slot, confirm ASM_START selects noreorder + sd r9,24(r4) + +$Lend1: mflo r10 + mfhi r9 + sd r10,0(r4) + j r31 + sd r9,8(r4) +EPILOGUE(mpn_sqr_diagonal) diff --git a/gcc/gmp/mpn/mips64/hilo/submul_1.asm b/gcc/gmp/mpn/mips64/hilo/submul_1.asm new file mode 100644 index 0000000..089589c 100644 --- /dev/null +++ b/gcc/gmp/mpn/mips64/hilo/submul_1.asm @@ -1,0 +1,101 @@ +dnl MIPS64 mpn_submul_1 -- Multiply a limb vector with a single limb and +dnl subtract the product from a second limb vector. + +dnl Copyright 1992, 1994, 1995, 2000-2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr $4 (limbs are loaded, updated and stored back in place) +C s1_ptr $5 +C size $6 (limb count; the first s1 limb is loaded before size is tested) +C s2_limb $7 (the final borrow limb is left in $2 on return) + +ASM_START() +PROLOGUE(mpn_submul_1) + +C feed-in phase 0 + ld $8,0($5) C $8 = first s1 limb + +C feed-in phase 1 + daddiu $5,$5,8 C step s1_ptr to the next 8-byte limb + dmultu $8,$7 C HI:LO = s1 limb times s2_limb + + daddiu $6,$6,-1 + beq $6,$0,$LC0 C size was 1: only wind-down phase 0 is left + move $2,$0 C zero cy2 + + daddiu $6,$6,-1 + beq $6,$0,$LC1 C size was 2: skip the loop + ld $8,0($5) C load new s1 limb as early as possible + +Loop: ld $10,0($4) C $10 = res limb to subtract from + mflo $3 C $3 = low product limb + mfhi $9 C $9 = high product limb + daddiu $5,$5,8 + daddu $3,$3,$2 C add old carry limb to low product limb + dmultu $8,$7 C start the next product + ld $8,0($5) C load new s1 limb as early as possible + daddiu $6,$6,-1 C decrement loop counter + sltu $2,$3,$2 C carry from previous addition -> $2 + dsubu $3,$10,$3 C res limb minus the product limb + sgtu $10,$3,$10 C borrow: the difference exceeds the original res limb + daddu $2,$2,$10 C merge carry and borrow + sd $3,0($4) C store the updated res limb + daddiu $4,$4,8 + bne $6,$0,Loop + daddu $2,$9,$2 C add high product limb and carry from addition + +C wind-down phase 1 +$LC1: ld $10,0($4) + mflo $3 + mfhi $9 + daddu $3,$3,$2 + sltu $2,$3,$2 + dmultu $8,$7 C product of the last s1 limb, consumed in phase 0 + dsubu $3,$10,$3 + sgtu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) +
daddiu $4,$4,8 + daddu $2,$9,$2 C add high product limb and carry from addition + +C wind-down phase 0 +$LC0: ld $10,0($4) + mflo $3 + mfhi $9 + daddu $3,$3,$2 + sltu $2,$3,$2 + dsubu $3,$10,$3 + sgtu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) C store the last res limb + j $31 C return; NOTE(review): the next instruction looks placed for the branch delay slot, confirm ASM_START selects noreorder + daddu $2,$9,$2 C add high product limb and carry from addition +EPILOGUE(mpn_submul_1) diff --git a/gcc/gmp/mpn/mips64/hilo/umul.asm b/gcc/gmp/mpn/mips64/hilo/umul.asm new file mode 100644 index 0000000..b9aac57 100644 --- /dev/null +++ b/gcc/gmp/mpn/mips64/hilo/umul.asm @@ -1,0 +1,45 @@ +dnl MIPS64 umul_ppmm -- longlong.h support. + +dnl Copyright 2002 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/.
+ +include(`../config.m4') + +C INPUT PARAMETERS +C plp $4 (address that receives the low product limb) +C u $5 +C v $6 (the high product limb is left in $2 on return) + +ASM_START() +PROLOGUE(mpn_umul_ppmm) + dmultu $5,$6 C HI:LO = u times v, unsigned + mflo $3 C $3 = low product limb + mfhi $2 C $2 = high product limb + j $31 C return; NOTE(review): the store below looks placed for the branch delay slot, confirm ASM_START selects noreorder + sd $3,0($4) C store the low limb through plp +EPILOGUE(mpn_umul_ppmm) diff --git a/gcc/gmp/mpn/powerpc64/mode64/bdiv_q_1.asm b/gcc/gmp/mpn/powerpc64/mode64/bdiv_q_1.asm new file mode 100644 index 0000000..307aafc 100644 --- /dev/null +++ b/gcc/gmp/mpn/powerpc64/mode64/bdiv_q_1.asm @@ -1,0 +1,146 @@ +dnl PowerPC-64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb +dnl divisor. + +dnl Copyright 2006, 2010, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/.
+ +include(`../config.m4') + +C cycles/limb +C norm unorm +C POWER3/PPC630 13-19 +C POWER4/PPC970 16 +C POWER5 16 16 +C POWER6 37 46 +C POWER7 12 12 +C POWER8 12 12 + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`d', `r6') +define(`di', `r7') +define(`cnt',`r8') + +define(`tnc',`r10') + +ASM_START() + +EXTERN(binvert_limb_table) + +PROLOGUE(mpn_bdiv_q_1,toc) + addi r7, n, -1 C loop count n-1, moved to ctr before r7 is reused as di + cmpdi cr1, n, 1 C cr1 = single-limb case, tested by the shared loops + ld r12, 0(up) C r12 = first dividend limb, as the shared code expects + li cnt, 0 + neg r0, d + and r0, d, r0 C isolate the lowest set bit of d + cntlzd r0, r0 + subfic cnt, r0, 63 C cnt = trailing zero bits of d; leaves carry set for nonzero d, which the subfe in L(tpn) consumes + srd d, d, cnt C strip the trailing zeros so d is odd +L(7): + mtctr r7 + LEA( r10, binvert_limb_table) + rldicl r11, d, 63, 57 C table index = low 7 bits of d shifted right by one + lbzx r0, r10, r11 C starting inverse byte from the table + mulld r9, r0, r0 C three rounds of r0 = 2*r0 - d*r0*r0 follow + sldi r0, r0, 1 + mulld r9, d, r9 + subf r0, r9, r0 + mulld r10, r0, r0 + sldi r0, r0, 1 + mulld r10, d, r10 + subf r0, r10, r0 + mulld r9, r0, r0 + sldi r0, r0, 1 + mulld r9, d, r9 + subf di, r9, r0 C di = 1/d mod 2^64 +ifdef(`AIX', +` C For AIX it is not clear how to jump into another function. + b .mpn_pi1_bdiv_q_1 +',` + cmpdi cr0, cnt, 0 C non-AIX: branch into the pi1 code below; cr0 must be set here, cnt = 0 means d was odd
+ beq cr0, L(norm) C same test and sense as the pi1 entry: no shift needed + b L(unorm) +') +EPILOGUE() + +PROLOGUE(mpn_pi1_bdiv_q_1) + cmpdi cr0, cnt, 0 C cnt = 0 selects the loop without limb shifting + ld r12, 0(up) + addic r0, n, -1 C set carry as side effect + cmpdi cr1, n, 1 + mtctr r0 + beq cr0, L(norm) + +L(unorm): + subfic tnc, cnt, 64 C set carry as side effect + li r5, 0 C r5 = high limb carried between iterations, starts at zero + srd r11, r12, cnt + beq cr1, L(ed1) + + ALIGN(16) +L(tpu): ld r12, 8(up) + nop + addi up, up, 8 + sld r0, r12, tnc + or r11, r11, r0 C r11 = dividend limb assembled from two source limbs shifted right by cnt + subfe r9, r5, r11 C subtract the carried high limb, with borrow + srd r11, r12, cnt + mulld r0, di, r9 C quotient limb + mulhdu r5, r0, d C high limb of quotient limb times d, carried to the next pass + std r0, 0(rp) + addi rp, rp, 8 + bdnz L(tpu) + + subfe r11, r5, r11 +L(ed1): mulld r0, di, r11 + std r0, 0(rp) + blr + + ALIGN(16) +L(norm): + mulld r11, r12, di C first quotient limb + mulhdu r5, r11, d + std r11, 0(rp) + beqlr cr1 C single limb: done + + ALIGN(16) +L(tpn): ld r9, 8(up) + addi up, up, 8 + subfe r5, r5, r9 C next dividend limb minus the carried high limb, with borrow + mulld r11, di, r5 + mulhdu r5, r11, d C result not used in last iteration + std r11, 8(rp) + addi rp, rp, 8 + bdnz L(tpn) + + blr +EPILOGUE() +ASM_END() diff --git a/gcc/gmp/mpn/powerpc64/mode64/gcd_11.asm b/gcc/gmp/mpn/powerpc64/mode64/gcd_11.asm new file mode 100644 index 0000000..f9792e5 100644 --- /dev/null +++ b/gcc/gmp/mpn/powerpc64/mode64/gcd_11.asm @@ -1,0 +1,77 @@ +dnl PowerPC-64 mpn_gcd_11. + +dnl Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here.
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/bit (approx) +C POWER3/PPC630 ? +C POWER4/PPC970 8.5 obsolete +C POWER5 ? +C POWER6 ? +C POWER7 9.4 obsolete +C POWER8 ? +C POWER9 ? +C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1 + +define(`u0', `r3') +define(`v0', `r4') + +define(`mask', `r0')dnl +define(`a1', `r4')dnl +define(`a2', `r5')dnl +define(`d1', `r6')dnl +define(`d2', `r7')dnl +define(`cnt', `r9')dnl + +ASM_START() +PROLOGUE(mpn_gcd_11) + li r12, 63 + mr r8, v0 + subf. r10, u0, v0 C r10 = d - a + beq L(end) + + ALIGN(16) +L(top): subfc r11, r8, r3 C r11 = a - d + and d2, r11, r10 + subfe mask, mask, mask + cntlzd cnt, d2 + and a1, r10, mask C d - a + andc a2, r11, mask C a - d + and d1, r3, mask C a + andc d2, r8, mask C d + or r3, a1, a2 C new a + subf cnt, cnt, r12 + or r8, d1, d2 C new d + srd r3, r3, cnt + subf. r10, r3, r8 C r10 = d - a + bne L(top) + +L(end): blr +EPILOGUE() diff --git a/gcc/gmp/mpn/riscv/64/aors_n.asm b/gcc/gmp/mpn/riscv/64/aors_n.asm new file mode 100644 index 0000000..6e38083 100644 --- /dev/null +++ b/gcc/gmp/mpn/riscv/64/aors_n.asm @@ -1,0 +1,89 @@ +dnl RISC-V/64 mpn_add_n and mpn_sub_n. + +dnl Copyright 2016 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp', `a0') +define(`up', `a1') +define(`vp', `a2') +define(`n', `a3') + +ifdef(`OPERATION_add_n',` + define(`ADDSUB', `add') + define(`CMPCY', `sltu $1, $2, $3') + define(`func', `mpn_add_n') +') +ifdef(`OPERATION_sub_n',` + define(`ADDSUB', `sub') + define(`CMPCY', `sltu $1, $3, $2') + define(`func', `mpn_sub_n') +') + +MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n) + +ASM_START() +PROLOGUE(func) + li t6, 0 + + andi t0, n, 1 + beq t0, x0, L(top) + addi up, up, 8 + addi vp, vp, -8 + addi rp, rp, -8 + addi n, n, -1 + j L(mid) + +L(top): ld a4, 0(up) + ld a6, 0(vp) + addi n, n, -2 C bookkeeping + addi up, up, 16 C bookkeeping + ADDSUB t0, a4, a6 + CMPCY( t2, t0, a4) + ADDSUB t4, t0, t6 C cycle 3, 9, ... + CMPCY( t3, t4, t0) C cycle 4, 10, ... + sd t4, 0(rp) + add t6, t2, t3 C cycle 5, 11, ... 
+L(mid): ld a5, -8(up) + ld a7, 8(vp) + addi vp, vp, 16 C bookkeeping + addi rp, rp, 16 C bookkeeping + ADDSUB t1, a5, a7 + CMPCY( t2, t1, a5) + ADDSUB t4, t1, t6 C cycle 0, 6, ... + CMPCY( t3, t4, t1) C cycle 1, 7, ... + sd t4, -8(rp) + add t6, t2, t3 C cycle 2, 8, ... + bne n, x0, L(top) C bookkeeping + +L(end): mv a0, t6 + ret +EPILOGUE() +ASM_END() diff --git a/gcc/gmp/mpn/riscv/64/aorsmul_1.asm b/gcc/gmp/mpn/riscv/64/aorsmul_1.asm new file mode 100644 index 0000000..1125a9f 100644 --- /dev/null +++ b/gcc/gmp/mpn/riscv/64/aorsmul_1.asm @@ -1,0 +1,75 @@ +dnl RISC-V/64 mpn_addmul_1 and mpn_submul_1. + +dnl Copyright 2016 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp', `a0') +define(`up', `a1') +define(`n', `a2') +define(`v0', `a3') + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`CMPCY', `sltu $1, $2, $3') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`CMPCY', `sltu $1, $3, $2') + define(`func', `mpn_submul_1') +') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +ASM_START() +PROLOGUE(func) + li a6, 0 + +L(top): ld a7, 0(up) + addi up, up, 8 C bookkeeping + ld a4, 0(rp) + addi rp, rp, 8 C bookkeeping + mul a5, a7, v0 + addi n, n, -1 C bookkeeping + mulhu a7, a7, v0 + ADDSUB a5, a4, a5 + ADDSUB a6, a5, a6 C cycle 0, 3, ... + CMPCY( a4, a5, a4) + add a4, a4, a7 + CMPCY( a5, a6, a5) C cycle 1, 4, ... + sd a6, -8(rp) + add a6, a4, a5 C cycle 2, 5, ... + bne n, x0, L(top) C bookkeeping + +L(end): mv a0, a6 + ret +EPILOGUE() +ASM_END() diff --git a/gcc/gmp/mpn/riscv/64/mul_1.asm b/gcc/gmp/mpn/riscv/64/mul_1.asm new file mode 100644 index 0000000..e35eaa9 100644 --- /dev/null +++ b/gcc/gmp/mpn/riscv/64/mul_1.asm @@ -1,0 +1,58 @@ +dnl RISC-V/64 mpn_mul_1. + +dnl Copyright 2016 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +define(`rp', `a0') +define(`up', `a1') +define(`n', `a2') +define(`v0', `a3') + +ASM_START() +PROLOGUE(mpn_mul_1) + li a6, 0 + +L(top): ld a7, 0(up) + addi up, up, 8 C bookkeeping + addi rp, rp, 8 C bookkeeping + mul a5, a7, v0 + addi n, n, -1 C bookkeeping + mulhu a7, a7, v0 + add a6, a5, a6 C cycle 0, 3, ... + sltu a5, a6, a5 C cycle 1, 4, ... + sd a6, -8(rp) + add a6, a7, a5 C cycle 2, 5, ... + bne n, x0, L(top) C bookkeeping + +L(end): mv a0, a6 + ret +EPILOGUE() +ASM_END() diff --git a/gcc/gmp/mpn/sparc64/ultrasparct3/bdiv_q_1.asm b/gcc/gmp/mpn/sparc64/ultrasparct3/bdiv_q_1.asm new file mode 100644 index 0000000..9847047 100644 --- /dev/null +++ b/gcc/gmp/mpn/sparc64/ultrasparct3/bdiv_q_1.asm @@ -1,0 +1,137 @@ +dnl SPARC T3/T4/T5 mpn_bdiv_q_1. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC T3: 31 +C UltraSPARC T4/T5: 20-26 hits 20 early, then sharply drops + +C INPUT PARAMETERS +define(`qp', `%i0') +define(`ap', `%i1') +define(`n', `%i2') +define(`d', `%i3') +define(`dinv',`%i4') +define(`cnt', `%i5') + +define(`tnc', `%o2') + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) +PROLOGUE(mpn_bdiv_q_1) + save %sp, -176, %sp + ldx [ap], %o5 + add d, -1, %g1 + andn %g1, d, %g1 + popc %g1, cnt + + srlx d, cnt, d + srlx d, 1, %g1 + and %g1, 127, %g1 + LEA64(binvert_limb_table, g2, g4) + ldub [%g2+%g1], %g1 + add %g1, %g1, %g2 + mulx %g1, %g1, %g1 + mulx %g1, d, %g1 + sub %g2, %g1, %g2 + add %g2, %g2, %g1 + mulx %g2, %g2, %g2 + mulx %g2, d, %g2 + sub %g1, %g2, %g1 + add %g1, %g1, %o7 + mulx %g1, %g1, %g1 + mulx %g1, d, %g1 + add n, -2, n + brz,pt cnt, L(norm) + sub %o7, %g1, dinv + + brlz,pt n, L(edu) + srlx %o5, cnt, %o5 + b L(eee) + mov 0, %g4 +EPILOGUE() + +PROLOGUE(mpn_pi1_bdiv_q_1) + save %sp, -176, %sp + ldx [ap], %o5 + + brz,pt cnt, L(norm) + add n, -2, n + +L(unorm): + brlz,pt n, L(edu) + srlx %o5, cnt, %o5 + mov 0, %g4 +L(eee): sub %g0, cnt, tnc + +L(tpu): ldx [ap+8], %g3 + add ap, 8, ap + sllx %g3, tnc, %g5 + or %g5, %o5, %g5 + srlx %g3, cnt, %o5 + subcc %g5, %g4, %g4 + mulx %g4, dinv, %g1 + stx %g1, [qp] + add qp, 8, qp + umulxhi(d, %g1, %g1) + addxc( %g1, %g0, %g4) + brgz,pt n, L(tpu) + add n, -1, n + + sub %o5, %g4, %o5 +L(edu): mulx %o5, dinv, %g1 + return %i7+8 + stx %g1, [%o0] + +L(norm): + mulx dinv, %o5, %g1 + brlz,pt n, L(edn) + stx %g1, [qp] + add qp, 8, qp + addcc %g0, 0, %g4 + +L(tpn): umulxhi(d, %g1, %g1) + ldx [ap+8], %g5 + add ap, 8, ap + addxc( %g1, %g0, %g1) + subcc %g5, %g1, %g1 + mulx %g1, dinv, %g1 + stx 
%g1, [qp] + add qp, 8, qp + brgz,pt n, L(tpn) + add n, -1, n + +L(edn): return %i7+8 + nop +EPILOGUE() diff --git a/gcc/gmp/mpn/sparc64/ultrasparct45/gmp-mparam.h b/gcc/gmp/mpn/sparc64/ultrasparct45/gmp-mparam.h new file mode 100644 index 0000000..2fecdba 100644 --- /dev/null +++ b/gcc/gmp/mpn/sparc64/ultrasparct45/gmp-mparam.h @@ -1,0 +1,173 @@ +/* Sparc64 T4-T5 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 3600 MHz ultrasparct5 running GNU/Linux */ +/* FFT tuning limit = 0.5 M */ +/* Generated by tuneup.c, 2019-10-01, gcc 7.4 */ + +#define DIVREM_1_NORM_THRESHOLD 3 +#define DIVREM_1_UNNORM_THRESHOLD 3 +#define MOD_1_1P_METHOD 2 /* 0.34% faster than 1 */ +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 3 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13 +#define USE_PREINV_DIVREM_1 1 +#define DIV_QR_1N_PI1_METHOD 2 /* 27.84% faster than 1 */ +#define DIV_QR_1_NORM_THRESHOLD 3 +#define DIV_QR_1_UNNORM_THRESHOLD 2 +#define DIV_QR_2_PI2_THRESHOLD 5 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 19 + +#define DIV_1_VS_MUL_1_PERCENT 654 + +#define MUL_TOOM22_THRESHOLD 40 +#define MUL_TOOM33_THRESHOLD 129 +#define MUL_TOOM44_THRESHOLD 372 +#define MUL_TOOM6H_THRESHOLD 494 +#define MUL_TOOM8H_THRESHOLD 656 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 126 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 247 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 225 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 219 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 188 + +#define SQR_BASECASE_THRESHOLD 20 +#define SQR_TOOM2_THRESHOLD 59 +#define SQR_TOOM3_THRESHOLD 107 +#define SQR_TOOM4_THRESHOLD 298 +#define SQR_TOOM6_THRESHOLD 399 +#define SQR_TOOM8_THRESHOLD 562 + +#define MULMID_TOOM42_THRESHOLD 48 + +#define MULMOD_BNM1_THRESHOLD 25 +#define SQRMOD_BNM1_THRESHOLD 23 + +#define MUL_FFT_MODF_THRESHOLD 555 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 555, 5}, { 29, 6}, { 31, 7}, { 31, 8}, \ + { 17, 7}, { 36, 8}, { 19, 7}, { 39, 8}, \ + { 21, 7}, { 43, 8}, { 29, 9}, { 15, 8}, \ + { 31, 7}, { 63, 8}, { 35, 9}, { 19, 8}, \ + { 43, 9}, { 23, 8}, { 51, 9}, { 27, 8}, \ + { 57,10}, { 15, 8}, { 61, 9}, { 31, 8}, \ + { 
67, 9}, { 35, 8}, { 71, 9}, { 39, 8}, \ + { 81, 9}, { 43,10}, { 23, 9}, { 59,11}, \ + { 15,10}, { 31, 9}, { 71,10}, { 39, 9}, \ + { 87,10}, { 47, 9}, { 99,10}, { 55, 9}, \ + { 115,11}, { 31,10}, { 63, 9}, { 131,10}, \ + { 87,11}, { 47,10}, { 111, 9}, { 223,12}, \ + { 31,11}, { 63,10}, { 135,11}, { 79,10}, \ + { 159,11}, { 95,10}, { 191,11}, { 111,12}, \ + { 63,11}, { 143,10}, { 287,11}, { 159,12}, \ + { 95,11}, { 191,10}, { 383, 9}, { 767,13}, \ + { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 75 +#define MUL_FFT_THRESHOLD 5760 + +#define SQR_FFT_MODF_THRESHOLD 372 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 372, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95,12}, \ + { 31,11}, { 63,10}, { 135,11}, { 79,10}, \ + { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \ + { 383,11}, { 111,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271,11}, { 143,10}, \ + { 287, 9}, { 575,10}, { 303, 9}, { 607,11}, \ + { 159,10}, { 319, 9}, { 639,12}, { 95,11}, \ + { 191,10}, { 383, 9}, { 767,11}, { 207,13}, \ + { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 75 +#define SQR_FFT_THRESHOLD 3776 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 35 +#define MULLO_MUL_N_THRESHOLD 11278 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 168 +#define SQRLO_SQR_THRESHOLD 7511 + +#define DC_DIV_QR_THRESHOLD 36 +#define 
DC_DIVAPPR_Q_THRESHOLD 103 +#define DC_BDIV_QR_THRESHOLD 28 +#define DC_BDIV_Q_THRESHOLD 88 + +#define INV_MULMOD_BNM1_THRESHOLD 78 +#define INV_NEWTON_THRESHOLD 181 +#define INV_APPR_THRESHOLD 118 + +#define BINV_NEWTON_THRESHOLD 296 +#define REDC_1_TO_REDC_2_THRESHOLD 4 +#define REDC_2_TO_REDC_N_THRESHOLD 79 + +#define MU_DIV_QR_THRESHOLD 1970 +#define MU_DIVAPPR_Q_THRESHOLD 1970 +#define MUPI_DIV_QR_THRESHOLD 82 +#define MU_BDIV_QR_THRESHOLD 1528 +#define MU_BDIV_Q_THRESHOLD 1970 + +#define POWM_SEC_TABLE 1,58,102,1509 + +#define GET_STR_DC_THRESHOLD 15 +#define GET_STR_PRECOMPUTE_THRESHOLD 29 +#define SET_STR_DC_THRESHOLD 686 +#define SET_STR_PRECOMPUTE_THRESHOLD 2717 + +#define FAC_DSC_THRESHOLD 336 +#define FAC_ODD_THRESHOLD 24 + +#define MATRIX22_STRASSEN_THRESHOLD 32 +#define HGCD2_DIV1_METHOD 1 /* 0.66% faster than 3 */ +#define HGCD_THRESHOLD 57 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 3389 +#define GCD_DC_THRESHOLD 386 +#define GCDEXT_DC_THRESHOLD 288 +#define JACOBI_BASE_METHOD 4 /* 2.50% faster than 3 */ diff --git a/gcc/gmp/mpn/x86/bd4/gmp-mparam.h b/gcc/gmp/mpn/x86/bd4/gmp-mparam.h new file mode 100644 index 0000000..6c20d0f 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86/bd4/gmp-mparam.h @@ -1,0 +1,225 @@ +/* AMD bd4 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 3800-4200 MHz Excavator/Bristol Ridge */ +/* FFT tuning limit = 67,000,000 */ +/* Generated by tuneup.c, 2019-10-23, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 27 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 50 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 28.45% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 4 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 13 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 28 + +#define DIV_1_VS_MUL_1_PERCENT 314 + +#define MUL_TOOM22_THRESHOLD 32 +#define MUL_TOOM33_THRESHOLD 73 +#define MUL_TOOM44_THRESHOLD 166 +#define MUL_TOOM6H_THRESHOLD 270 +#define MUL_TOOM8H_THRESHOLD 357 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 69 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 103 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 121 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 154 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 42 +#define SQR_TOOM3_THRESHOLD 89 +#define SQR_TOOM4_THRESHOLD 208 +#define SQR_TOOM6_THRESHOLD 306 +#define SQR_TOOM8_THRESHOLD 454 + +#define MULMID_TOOM42_THRESHOLD 68 + +#define MULMOD_BNM1_THRESHOLD 19 +#define 
SQRMOD_BNM1_THRESHOLD 18 + +#define MUL_FFT_MODF_THRESHOLD 570 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 570, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ + { 32, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ + { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \ + { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ + { 47, 8}, { 95,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \ + { 191,11}, { 63,10}, { 143, 6}, { 2303, 5}, \ + { 4735, 4}, { 9471, 5}, { 4863, 7}, { 1279, 9}, \ + { 335, 8}, { 671, 9}, { 351, 8}, { 703,10}, \ + { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,11}, { 159,10}, \ + { 319, 9}, { 639,10}, { 335, 9}, { 671, 8}, \ + { 1343,10}, { 351, 9}, { 703,10}, { 367, 9}, \ + { 735,11}, { 191,10}, { 383, 9}, { 767,10}, \ + { 399, 9}, { 799, 8}, { 1599,10}, { 415,11}, \ + { 223,12}, { 127,11}, { 255,10}, { 543, 9}, \ + { 1087,11}, { 287,10}, { 607, 9}, { 1215,11}, \ + { 319,10}, { 671, 9}, { 1343,11}, { 351,12}, \ + { 191,11}, { 383,10}, { 799,11}, { 415,10}, \ + { 863,13}, { 127,12}, { 255,11}, { 543,10}, \ + { 1087,11}, { 607,10}, { 1215, 9}, { 2431,12}, \ + { 319,11}, { 671,10}, { 1343,11}, { 735,10}, \ + { 1471, 9}, { 2943,12}, { 383,11}, { 799,10}, \ + { 1599,11}, { 863,10}, { 1727,12}, { 447,11}, \ + { 959,10}, { 1919,13}, { 255,12}, { 511,11}, \ + { 1087,12}, { 575,11}, { 1215,10}, { 2431,12}, \ + { 639,11}, { 1343,12}, { 703,11}, { 1471,10}, \ + { 2943,13}, { 383,12}, { 767,11}, { 1599,12}, \ + { 831,11}, { 1727,10}, { 3455,12}, { 959,11}, \ + { 1919,10}, { 3839,13}, { 511,12}, { 1087,11}, \ + { 2239,12}, { 1215,11}, { 2431,13}, { 639,12}, \ + { 1471,11}, { 2943,10}, { 5887,13}, { 767,12}, \ + { 1727,11}, { 3455,13}, { 895,12}, { 1919,11}, \ + { 3839,14}, { 511,13}, { 1023,12}, { 2239,13}, \ + { 1151,12}, { 
2431,13}, { 1279,12}, { 2559,13}, \ + { 1407,12}, { 2943,11}, { 5887,14}, { 767,13}, \ + { 1663,12}, { 3455,13}, { 1919,12}, { 3839,15}, \ + { 511,14}, { 1023,13}, { 2175,12}, { 4479,13}, \ + { 2431,14}, { 1279,13}, { 2943,12}, { 5887,14}, \ + { 1535,13}, { 3455,14}, { 1791,13}, { 3967,12}, \ + { 7935,15}, { 1023,14}, { 2047,13}, { 4479,14}, \ + { 2303,13}, { 4991,12}, { 9983,14}, { 2815,13}, \ + { 5887,15}, { 1535,14}, { 3839,13}, { 7935,16} } +#define MUL_FFT_TABLE3_SIZE 192 +#define MUL_FFT_THRESHOLD 5760 + +#define SQR_FFT_MODF_THRESHOLD 476 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 476, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \ + { 16, 5}, { 33, 6}, { 29, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \ + { 39, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \ + { 51, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 79, 9}, { 47, 8}, { 95,10}, { 31, 9}, \ + { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \ + { 63, 9}, { 135,10}, { 95, 9}, { 191,10}, \ + { 111,11}, { 63,10}, { 127, 9}, { 255,10}, \ + { 143, 9}, { 287, 8}, { 575,10}, { 159,11}, \ + { 95,10}, { 191,12}, { 63,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,10}, { 287, 9}, \ + { 575,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 335, 9}, { 671,10}, { 351, 9}, { 735,11}, \ + { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \ + { 799,10}, { 415, 9}, { 863,12}, { 127,11}, \ + { 255,10}, { 511, 9}, { 1023,10}, { 543,11}, \ + { 287,10}, { 607, 9}, { 1215,11}, { 319,10}, \ + { 671, 9}, { 1343,11}, { 351,10}, { 735,12}, \ + { 191,11}, { 383,10}, { 799,11}, { 415,10}, \ + { 863,13}, { 127,12}, { 255,11}, { 511,10}, \ + { 1055,11}, { 543,10}, { 1087,11}, { 607,10}, \ + { 1215,12}, { 319,11}, { 671,10}, { 1343,11}, \ + { 735,10}, { 1471,12}, { 383,11}, { 799,10}, \ + { 1599,11}, { 863,10}, { 1727,12}, { 447,11}, \ + { 959,13}, { 255,12}, { 511,11}, { 1087,12}, \ + { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1471,13}, { 383,12}, { 
767,11}, \ + { 1599,12}, { 831,11}, { 1727,12}, { 959,11}, \ + { 1919,14}, { 255,13}, { 511,12}, { 1023,11}, \ + { 2047,12}, { 1087,11}, { 2239,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1471,11}, { 2943,13}, \ + { 767,12}, { 1727,13}, { 895,12}, { 1983,14}, \ + { 511,13}, { 1023,12}, { 2239,13}, { 1151,12}, \ + { 2431,13}, { 1279,12}, { 2559,13}, { 1407,12}, \ + { 2943,14}, { 767,13}, { 1663,12}, { 3455,13}, \ + { 1919,12}, { 3839,15}, { 511,14}, { 1023,13}, \ + { 2175,12}, { 4479,13}, { 2431,14}, { 1279,13}, \ + { 2943,12}, { 5887,14}, { 1535,13}, { 3455,14}, \ + { 1791,13}, { 3967,15}, { 1023,14}, { 2047,13}, \ + { 4479,14}, { 2303,13}, { 4991,12}, { 9983,14}, \ + { 2815,13}, { 5887,15}, { 1535,14}, { 3839,16} } +#define SQR_FFT_TABLE3_SIZE 176 +#define SQR_FFT_THRESHOLD 4736 + +#define MULLO_BASECASE_THRESHOLD 3 +#define MULLO_DC_THRESHOLD 54 +#define MULLO_MUL_N_THRESHOLD 10950 +#define SQRLO_BASECASE_THRESHOLD 10 +#define SQRLO_DC_THRESHOLD 77 +#define SQRLO_SQR_THRESHOLD 9449 + +#define DC_DIV_QR_THRESHOLD 84 +#define DC_DIVAPPR_Q_THRESHOLD 252 +#define DC_BDIV_QR_THRESHOLD 79 +#define DC_BDIV_Q_THRESHOLD 80 + +#define INV_MULMOD_BNM1_THRESHOLD 71 +#define INV_NEWTON_THRESHOLD 254 +#define INV_APPR_THRESHOLD 266 + +#define BINV_NEWTON_THRESHOLD 294 +#define REDC_1_TO_REDC_N_THRESHOLD 79 + +#define MU_DIV_QR_THRESHOLD 1652 +#define MU_DIVAPPR_Q_THRESHOLD 1528 +#define MUPI_DIV_QR_THRESHOLD 122 +#define MU_BDIV_QR_THRESHOLD 1387 +#define MU_BDIV_Q_THRESHOLD 1528 + +#define POWM_SEC_TABLE 1,16,96,480,960 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 19 +#define SET_STR_DC_THRESHOLD 264 +#define SET_STR_PRECOMPUTE_THRESHOLD 542 + +#define FAC_DSC_THRESHOLD 91 +#define FAC_ODD_THRESHOLD 29 + +#define MATRIX22_STRASSEN_THRESHOLD 19 +#define HGCD2_DIV1_METHOD 1 /* 9.73% faster than 3 */ +#define HGCD_THRESHOLD 55 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 3389 +#define GCD_DC_THRESHOLD 562 +#define 
GCDEXT_DC_THRESHOLD 416 +#define JACOBI_BASE_METHOD 4 /* 16.50% faster than 1 */ + +/* Tuneup completed successfully, took 49179 seconds */ diff --git a/gcc/gmp/mpn/x86/bt1/gmp-mparam.h b/gcc/gmp/mpn/x86/bt1/gmp-mparam.h new file mode 100644 index 0000000..302dbc6 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86/bt1/gmp-mparam.h @@ -1,0 +1,218 @@ +/* x86/bobcat gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be greater than + the value in mpn/x86/k7/gmp-mparam.h. The latter is used as a hard limit in + k7/sqr_basecase.asm. 
*/ + +/* 1600 MHz AMD Bobcat Zacate E-350 */ +/* FFT tuning limit = 67,000,000 */ +/* Generated by tuneup.c, 2019-10-17, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 10 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 16 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 21 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 57.16% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 3 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 36 + +#define DIV_1_VS_MUL_1_PERCENT 199 + +#define MUL_TOOM22_THRESHOLD 28 +#define MUL_TOOM33_THRESHOLD 93 +#define MUL_TOOM44_THRESHOLD 166 +#define MUL_TOOM6H_THRESHOLD 270 +#define MUL_TOOM8H_THRESHOLD 478 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 102 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 177 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 169 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 143 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 50 +#define SQR_TOOM3_THRESHOLD 89 +#define SQR_TOOM4_THRESHOLD 248 +#define SQR_TOOM6_THRESHOLD 342 +#define SQR_TOOM8_THRESHOLD 470 + +#define MULMID_TOOM42_THRESHOLD 72 + +#define MULMOD_BNM1_THRESHOLD 20 +#define SQRMOD_BNM1_THRESHOLD 21 + +#define MUL_FFT_MODF_THRESHOLD 630 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 630, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 15, 5}, { 31, 6}, { 27, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 23, 6}, { 47, 7}, { 29, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 49, 8}, { 27, 7}, { 55, 9}, \ + { 15, 8}, { 31, 7}, { 63, 8}, { 43, 9}, \ + { 23, 8}, { 55, 9}, { 31, 8}, { 67, 9}, \ + { 
39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \ + { 55,10}, { 31, 9}, { 79,10}, { 47, 6}, \ + { 767, 7}, { 399, 6}, { 799, 7}, { 415, 8}, \ + { 235, 7}, { 479, 9}, { 135,10}, { 79, 9}, \ + { 159,10}, { 95, 9}, { 191,11}, { 63,10}, \ + { 159,11}, { 95,10}, { 191,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 335, 9}, { 671,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 399, 9}, { 799,11}, { 223,12}, \ + { 127,11}, { 255,10}, { 543, 9}, { 1087,11}, \ + { 287,10}, { 607, 9}, { 1215,11}, { 319,10}, \ + { 671,11}, { 351,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,13}, { 127,12}, { 255,11}, \ + { 543,10}, { 1087,11}, { 607,10}, { 1215,12}, \ + { 319,11}, { 671,10}, { 1343,11}, { 735,10}, \ + { 1471,12}, { 383,11}, { 799,10}, { 1599,11}, \ + { 863,12}, { 447,11}, { 991,13}, { 255,12}, \ + { 511,11}, { 1087,12}, { 575,11}, { 1215,12}, \ + { 639,11}, { 1343,12}, { 703,11}, { 1471,13}, \ + { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \ + { 1727,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1215,13}, { 639,12}, { 1471,13}, { 767,12}, \ + { 1727,13}, { 895,12}, { 1919,14}, { 511,13}, \ + { 1023,12}, { 2111,13}, { 1151,12}, { 2431,13}, \ + { 1407,14}, { 767,13}, { 1663,12}, { 3455,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,12}, \ + { 4479,13}, { 2431,14}, { 1279,13}, { 2943,12}, \ + { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \ + { 3967,15}, { 1023,14}, { 2047,13}, { 4479,14}, \ + { 2303,13}, { 4991,12}, { 9983,14}, { 2815,13}, \ + { 5887,15}, { 1535,14}, { 3839,16} } +#define MUL_FFT_TABLE3_SIZE 159 +#define MUL_FFT_THRESHOLD 7424 + +#define SQR_FFT_MODF_THRESHOLD 500 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 500, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 28, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \ + { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \ + { 47, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ + { 39, 9}, { 23, 8}, { 51, 9}, { 31, 
8}, \ + { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \ + { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 127, 6}, { 1087, 7}, { 575, 8}, { 303, 9}, \ + { 159,10}, { 95,11}, { 63,10}, { 127, 9}, \ + { 255,10}, { 143, 9}, { 287,10}, { 159,11}, \ + { 95,10}, { 191,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \ + { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 335, 9}, { 671,10}, { 351,11}, \ + { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \ + { 799,10}, { 415, 9}, { 831,12}, { 127,11}, \ + { 255,10}, { 543,11}, { 287,10}, { 607,11}, \ + { 319,10}, { 671,11}, { 351,10}, { 703,12}, \ + { 191,11}, { 383,10}, { 799,11}, { 415,10}, \ + { 831,13}, { 127,12}, { 255,11}, { 543,10}, \ + { 1087,11}, { 607,12}, { 319,11}, { 671,10}, \ + { 1343,11}, { 735,10}, { 1471,12}, { 383,11}, \ + { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \ + { 959,13}, { 255,12}, { 511,11}, { 1087,12}, \ + { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \ + { 1599,12}, { 831,11}, { 1727,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1215,13}, { 639,12}, \ + { 1471,13}, { 767,12}, { 1727,13}, { 895,12}, \ + { 1919,14}, { 511,13}, { 1023,12}, { 2111,13}, \ + { 1151,12}, { 2431,13}, { 1407,14}, { 767,13}, \ + { 1663,12}, { 3455,13}, { 1919,15}, { 511,14}, \ + { 1023,13}, { 2175,12}, { 4479,13}, { 2431,14}, \ + { 1279,13}, { 2943,12}, { 5887,14}, { 1535,13}, \ + { 3455,14}, { 1791,13}, { 3839,15}, { 1023,14}, \ + { 2047,13}, { 4479,14}, { 2303,13}, { 4991,12}, \ + { 9983,14}, { 2815,13}, { 5887,15}, { 1535,14}, \ + { 3839,16} } +#define SQR_FFT_TABLE3_SIZE 161 +#define SQR_FFT_THRESHOLD 5760 + +#define MULLO_BASECASE_THRESHOLD 9 +#define MULLO_DC_THRESHOLD 48 +#define MULLO_MUL_N_THRESHOLD 14281 +#define SQRLO_BASECASE_THRESHOLD 7 +#define SQRLO_DC_THRESHOLD 146 +#define SQRLO_SQR_THRESHOLD 11278 + +#define DC_DIV_QR_THRESHOLD 77 +#define DC_DIVAPPR_Q_THRESHOLD 240 +#define 
DC_BDIV_QR_THRESHOLD 83 +#define DC_BDIV_Q_THRESHOLD 182 + +#define INV_MULMOD_BNM1_THRESHOLD 74 +#define INV_NEWTON_THRESHOLD 252 +#define INV_APPR_THRESHOLD 252 + +#define BINV_NEWTON_THRESHOLD 252 +#define REDC_1_TO_REDC_N_THRESHOLD 79 + +#define MU_DIV_QR_THRESHOLD 1787 +#define MU_DIVAPPR_Q_THRESHOLD 1718 +#define MUPI_DIV_QR_THRESHOLD 122 +#define MU_BDIV_QR_THRESHOLD 1470 +#define MU_BDIV_Q_THRESHOLD 1713 + +#define POWM_SEC_TABLE 1,16,96,563,1317,1867 + +#define GET_STR_DC_THRESHOLD 19 +#define GET_STR_PRECOMPUTE_THRESHOLD 32 +#define SET_STR_DC_THRESHOLD 254 +#define SET_STR_PRECOMPUTE_THRESHOLD 907 + +#define FAC_DSC_THRESHOLD 224 +#define FAC_ODD_THRESHOLD 55 + +#define MATRIX22_STRASSEN_THRESHOLD 23 +#define HGCD2_DIV1_METHOD 3 /* 3.59% faster than 5 */ +#define HGCD_THRESHOLD 85 +#define HGCD_APPR_THRESHOLD 152 +#define HGCD_REDUCE_THRESHOLD 3389 +#define GCD_DC_THRESHOLD 531 +#define GCDEXT_DC_THRESHOLD 386 +#define JACOBI_BASE_METHOD 3 /* 0.92% faster than 1 */ + +/* Tuneup completed successfully, took 159946 seconds */ diff --git a/gcc/gmp/mpn/x86/bt2/gmp-mparam.h b/gcc/gmp/mpn/x86/bt2/gmp-mparam.h new file mode 100644 index 0000000..f936cb7 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86/bt2/gmp-mparam.h @@ -1,0 +1,214 @@ +/* x86/bobcat gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be greater than + the value in mpn/x86/k7/gmp-mparam.h. The latter is used as a hard limit in + k7/sqr_basecase.asm. */ + +/* 2050 MHz AMD Jaguar/Kabini */ +/* FFT tuning limit = 67,000,000 */ +/* Generated by tuneup.c, 2019-10-24, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 4 +#define MOD_1_UNNORM_THRESHOLD 6 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 18 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 47.53% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 27 + +#define DIV_1_VS_MUL_1_PERCENT 243 + +#define MUL_TOOM22_THRESHOLD 32 +#define MUL_TOOM33_THRESHOLD 90 +#define MUL_TOOM44_THRESHOLD 154 +#define MUL_TOOM6H_THRESHOLD 286 +#define MUL_TOOM8H_THRESHOLD 478 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 152 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 103 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 154 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 38 +#define 
SQR_TOOM3_THRESHOLD 126 +#define SQR_TOOM4_THRESHOLD 220 +#define SQR_TOOM6_THRESHOLD 318 +#define SQR_TOOM8_THRESHOLD 502 + +#define MULMID_TOOM42_THRESHOLD 68 + +#define MULMOD_BNM1_THRESHOLD 19 +#define SQRMOD_BNM1_THRESHOLD 25 + +#define MUL_FFT_MODF_THRESHOLD 570 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 570, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 15, 5}, { 31, 6}, { 28, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 23, 6}, { 47, 7}, { 27, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 49, 8}, { 31, 7}, { 63, 8}, \ + { 39, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \ + { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159,10}, { 95,11}, \ + { 63,10}, { 159,11}, { 95,10}, { 191,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543,10}, { 287,11}, { 159,10}, \ + { 319, 9}, { 639,10}, { 335, 9}, { 671,11}, \ + { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \ + { 799,10}, { 415,11}, { 223,12}, { 127,11}, \ + { 255,10}, { 543,11}, { 287,10}, { 607, 9}, \ + { 1215,11}, { 319,10}, { 671,11}, { 351,12}, \ + { 191,11}, { 383,10}, { 799,11}, { 415,13}, \ + { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \ + { 607,10}, { 1215,12}, { 319,11}, { 671,10}, \ + { 1343,11}, { 735,10}, { 1471,12}, { 383,11}, \ + { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \ + { 991,13}, { 255,12}, { 511,11}, { 1087,12}, \ + { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \ + { 1599,12}, { 831,11}, { 1727,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1215,13}, { 639,12}, \ + { 1471,13}, { 767,12}, { 1727,13}, { 895,12}, \ + { 1919,14}, { 511,13}, { 1023,12}, { 2111,13}, \ + { 1151,12}, { 2431,13}, { 1407,14}, { 767,13}, \ + { 1663,12}, { 3455,13}, { 1919,15}, { 511,14}, \ + { 1023,13}, { 2175,12}, { 4479,13}, { 2431,14}, \ + { 1279,13}, { 2943,12}, { 5887,14}, { 1535,13}, \ + { 3455,14}, { 
1791,13}, { 3967,15}, { 1023,14}, \ + { 2047,13}, { 4479,14}, { 2303,13}, { 4991,12}, \ + { 9983,14}, { 2815,13}, { 5887,15}, { 1535,14}, \ + { 3839,16} } +#define MUL_FFT_TABLE3_SIZE 153 +#define MUL_FFT_THRESHOLD 5760 + +#define SQR_FFT_MODF_THRESHOLD 530 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 530, 5}, { 27, 6}, { 15, 5}, { 31, 6}, \ + { 28, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \ + { 35, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \ + { 47, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 49, 8}, \ + { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \ + { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \ + { 31, 9}, { 79,10}, { 47, 9}, { 95,11}, \ + { 31,10}, { 63, 9}, { 135,10}, { 95,11}, \ + { 63,10}, { 143, 9}, { 287,10}, { 159,11}, \ + { 95,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \ + { 671,10}, { 351,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 399, 9}, { 799,12}, { 127,11}, \ + { 255,10}, { 543,11}, { 287,10}, { 607, 9}, \ + { 1215,11}, { 319,10}, { 671,11}, { 351,12}, \ + { 191,11}, { 383,10}, { 799,11}, { 415,10}, \ + { 831,13}, { 127,12}, { 255,11}, { 543,10}, \ + { 1087,11}, { 607,10}, { 1215,12}, { 319,11}, \ + { 671,10}, { 1343,11}, { 735,10}, { 1471,12}, \ + { 383,11}, { 799,10}, { 1599,11}, { 863,12}, \ + { 447,11}, { 991,13}, { 255,12}, { 511,11}, \ + { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \ + { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \ + { 767,11}, { 1599,12}, { 831,11}, { 1727,12}, \ + { 959,11}, { 1919,14}, { 255,13}, { 511,12}, \ + { 1215,13}, { 639,12}, { 1471,13}, { 767,12}, \ + { 1727,13}, { 895,12}, { 1919,14}, { 511,13}, \ + { 1023,12}, { 2111,13}, { 1151,12}, { 2495,13}, \ + { 1407,14}, { 767,13}, { 1663,12}, { 3455,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,12}, \ + { 4479,13}, { 2431,14}, { 1279,13}, { 2943,12}, \ + { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \ + { 3967,15}, { 1023,14}, { 
2047,13}, { 4479,14}, \ + { 2303,13}, { 4991,12}, { 9983,14}, { 2815,13}, \ + { 5887,15}, { 1535,14}, { 3839,16} } +#define SQR_FFT_TABLE3_SIZE 151 +#define SQR_FFT_THRESHOLD 4736 + +#define MULLO_BASECASE_THRESHOLD 8 +#define MULLO_DC_THRESHOLD 44 +#define MULLO_MUL_N_THRESHOLD 11278 +#define SQRLO_BASECASE_THRESHOLD 13 +#define SQRLO_DC_THRESHOLD 62 +#define SQRLO_SQR_THRESHOLD 8907 + +#define DC_DIV_QR_THRESHOLD 79 +#define DC_DIVAPPR_Q_THRESHOLD 228 +#define DC_BDIV_QR_THRESHOLD 75 +#define DC_BDIV_Q_THRESHOLD 136 + +#define INV_MULMOD_BNM1_THRESHOLD 90 +#define INV_NEWTON_THRESHOLD 260 +#define INV_APPR_THRESHOLD 236 + +#define BINV_NEWTON_THRESHOLD 294 +#define REDC_1_TO_REDC_N_THRESHOLD 80 + +#define MU_DIV_QR_THRESHOLD 1787 +#define MU_DIVAPPR_Q_THRESHOLD 1718 +#define MUPI_DIV_QR_THRESHOLD 118 +#define MU_BDIV_QR_THRESHOLD 1442 +#define MU_BDIV_Q_THRESHOLD 1652 + +#define POWM_SEC_TABLE 1,16,96,615,865,1442 + +#define GET_STR_DC_THRESHOLD 16 +#define GET_STR_PRECOMPUTE_THRESHOLD 27 +#define SET_STR_DC_THRESHOLD 252 +#define SET_STR_PRECOMPUTE_THRESHOLD 638 + +#define FAC_DSC_THRESHOLD 141 +#define FAC_ODD_THRESHOLD 39 + +#define MATRIX22_STRASSEN_THRESHOLD 19 +#define HGCD2_DIV1_METHOD 1 /* 13.65% faster than 3 */ +#define HGCD_THRESHOLD 81 +#define HGCD_APPR_THRESHOLD 66 +#define HGCD_REDUCE_THRESHOLD 3389 +#define GCD_DC_THRESHOLD 531 +#define GCDEXT_DC_THRESHOLD 345 +#define JACOBI_BASE_METHOD 1 /* 0.84% faster than 4 */ + +/* Tuneup completed successfully, took 103818 seconds */ diff --git a/gcc/gmp/mpn/x86/coreibwl/gmp-mparam.h b/gcc/gmp/mpn/x86/coreibwl/gmp-mparam.h new file mode 100644 index 0000000..7b58cad 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86/coreibwl/gmp-mparam.h @@ -1,0 +1,216 @@ +/* x86/coreibwl gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 3400-3800 MHz Intel Xeon E3-1285Lv4 Broadwell */ +/* FFT tuning limit = 67,000,000 */ +/* Generated by tuneup.c, 2019-10-20, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 15 +#define MOD_1_UNNORM_THRESHOLD 16 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 10 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 10 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 21.34% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 14 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 29 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 19 + +#define DIV_1_VS_MUL_1_PERCENT 295 + +#define MUL_TOOM22_THRESHOLD 26 +#define MUL_TOOM33_THRESHOLD 97 +#define MUL_TOOM44_THRESHOLD 220 +#define MUL_TOOM6H_THRESHOLD 306 +#define MUL_TOOM8H_THRESHOLD 454 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 93 +#define 
MUL_TOOM32_TO_TOOM53_THRESHOLD 153 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 154 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 169 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 136 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 44 +#define SQR_TOOM3_THRESHOLD 134 +#define SQR_TOOM4_THRESHOLD 242 +#define SQR_TOOM6_THRESHOLD 342 +#define SQR_TOOM8_THRESHOLD 502 + +#define MULMID_TOOM42_THRESHOLD 98 + +#define MULMOD_BNM1_THRESHOLD 20 +#define SQRMOD_BNM1_THRESHOLD 23 + +#define MUL_FFT_MODF_THRESHOLD 540 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 540, 5}, { 29, 6}, { 15, 5}, { 31, 6}, \ + { 16, 5}, { 33, 6}, { 17, 5}, { 36, 6}, \ + { 25, 7}, { 13, 6}, { 29, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \ + { 39, 7}, { 21, 6}, { 43, 7}, { 23, 6}, \ + { 47, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 43, 8}, { 23, 7}, { 49, 8}, \ + { 27, 7}, { 55, 9}, { 15, 8}, { 31, 7}, \ + { 63, 8}, { 43, 9}, { 23, 8}, { 55,10}, \ + { 15, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 83, 9}, { 47, 8}, { 95, 9}, { 55,10}, \ + { 31, 9}, { 79,10}, { 47, 9}, { 95,11}, \ + { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \ + { 159,10}, { 95, 9}, { 191,10}, { 111,11}, \ + { 63,10}, { 143, 9}, { 287,10}, { 159,11}, \ + { 95, 7}, { 1599, 8}, { 831, 9}, { 431, 8}, \ + { 863, 9}, { 447,10}, { 239, 9}, { 479,10}, \ + { 255, 9}, { 511,10}, { 287,11}, { 159,10}, \ + { 319, 9}, { 639,10}, { 335, 9}, { 671,11}, \ + { 191,10}, { 383, 9}, { 767,10}, { 399,11}, \ + { 223,12}, { 127,11}, { 255,10}, { 511, 9}, \ + { 1023,11}, { 287,10}, { 607,11}, { 319,10}, \ + { 671,11}, { 351,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,13}, { 127,12}, { 255,11}, \ + { 543,10}, { 1119,11}, { 607,12}, { 319,11}, \ + { 671,10}, { 1343,11}, { 735,12}, { 383,11}, \ + { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \ + { 959,13}, { 255,12}, { 511,11}, { 1119,12}, \ + { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1407,13}, { 383,12}, { 767,11}, \ + { 1599,12}, { 831,11}, { 
1727,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1215,13}, { 639,12}, \ + { 1471,13}, { 767,12}, { 1727,13}, { 895,12}, \ + { 1919,14}, { 511,13}, { 1023,12}, { 2239,13}, \ + { 1151,12}, { 2431,13}, { 1279,12}, { 2623,13}, \ + { 1407,12}, { 2815,14}, { 767,13}, { 1535,12}, \ + { 3135,13}, { 1663,12}, { 3455,13}, { 1919,15}, \ + { 511,14}, { 1023,13}, { 2175,12}, { 4479,13}, \ + { 2431,14}, { 1279,13}, { 2943,12}, { 5887,14}, \ + { 1535,13}, { 3455,14}, { 1791,13}, { 3839,15}, \ + { 1023,14}, { 2047,13}, { 4479,14}, { 2303,13}, \ + { 4991,12}, { 9983,14}, { 2559,13}, { 5247,14}, \ + { 2815,13}, { 5887,15}, { 1535,14}, { 3839,16} } +#define MUL_FFT_TABLE3_SIZE 172 +#define MUL_FFT_THRESHOLD 7424 + +#define SQR_FFT_MODF_THRESHOLD 472 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 472, 5}, { 29, 6}, { 15, 5}, { 33, 6}, \ + { 37, 7}, { 19, 6}, { 40, 7}, { 29, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \ + { 23, 7}, { 49, 8}, { 27, 9}, { 15, 8}, \ + { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \ + { 55,10}, { 15, 9}, { 31, 8}, { 67, 9}, \ + { 39, 8}, { 83, 9}, { 47, 8}, { 95, 9}, \ + { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \ + { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 143, 9}, { 287,10}, \ + { 159,11}, { 95,12}, { 63,11}, { 127,10}, \ + { 271, 9}, { 543, 6}, { 4479, 7}, { 2431, 8}, \ + { 1247, 7}, { 2495, 8}, { 1279,10}, { 351,11}, \ + { 191,10}, { 399, 9}, { 799,10}, { 415,12}, \ + { 127,11}, { 255,10}, { 543,11}, { 287,10}, \ + { 607,11}, { 319,10}, { 639,11}, { 351,12}, \ + { 191,11}, { 383,10}, { 799,11}, { 415,10}, \ + { 831,13}, { 127,12}, { 255,11}, { 511,10}, \ + { 1023,11}, { 543,10}, { 1087,11}, { 607,12}, \ + { 319,11}, { 671,10}, { 1343,11}, { 735,12}, \ + { 383,11}, { 799,10}, { 1599,11}, { 863,12}, \ + { 447,11}, { 927,13}, { 255,12}, { 511,11}, \ + { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \ + { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \ + { 767,11}, { 1599,12}, { 831,11}, { 
1663,12}, \ + { 895,11}, { 1855,14}, { 255,13}, { 511,12}, \ + { 1023,11}, { 2047,12}, { 1087,11}, { 2239,12}, \ + { 1215,13}, { 639,12}, { 1471,13}, { 767,12}, \ + { 1663,13}, { 895,12}, { 1983,14}, { 511,13}, \ + { 1023,12}, { 2239,13}, { 1151,12}, { 2495,13}, \ + { 1279,12}, { 2623,13}, { 1407,14}, { 767,13}, \ + { 1535,12}, { 3135,13}, { 1663,12}, { 3455,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,12}, \ + { 4479,13}, { 2431,14}, { 1279,13}, { 2943,12}, \ + { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \ + { 3839,15}, { 1023,14}, { 2047,13}, { 4479,14}, \ + { 2303,13}, { 4991,12}, { 9983,14}, { 2815,13}, \ + { 5887,15}, { 1535,14}, { 3327,13}, { 6783,14}, \ + { 3839,16} } +#define SQR_FFT_TABLE3_SIZE 157 +#define SQR_FFT_THRESHOLD 5568 + +#define MULLO_BASECASE_THRESHOLD 16 +#define MULLO_DC_THRESHOLD 37 +#define MULLO_MUL_N_THRESHOLD 14281 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 137 +#define SQRLO_SQR_THRESHOLD 10821 + +#define DC_DIV_QR_THRESHOLD 54 +#define DC_DIVAPPR_Q_THRESHOLD 146 +#define DC_BDIV_QR_THRESHOLD 98 +#define DC_BDIV_Q_THRESHOLD 218 + +#define INV_MULMOD_BNM1_THRESHOLD 50 +#define INV_NEWTON_THRESHOLD 173 +#define INV_APPR_THRESHOLD 165 + +#define BINV_NEWTON_THRESHOLD 278 +#define REDC_1_TO_REDC_N_THRESHOLD 79 + +#define MU_DIV_QR_THRESHOLD 1787 +#define MU_DIVAPPR_Q_THRESHOLD 1787 +#define MUPI_DIV_QR_THRESHOLD 78 +#define MU_BDIV_QR_THRESHOLD 1589 +#define MU_BDIV_Q_THRESHOLD 1830 + +#define POWM_SEC_TABLE 1,16,126,416,932 + +#define GET_STR_DC_THRESHOLD 11 +#define GET_STR_PRECOMPUTE_THRESHOLD 17 +#define SET_STR_DC_THRESHOLD 306 +#define SET_STR_PRECOMPUTE_THRESHOLD 894 + +#define FAC_DSC_THRESHOLD 141 +#define FAC_ODD_THRESHOLD 34 + +#define MATRIX22_STRASSEN_THRESHOLD 20 +#define HGCD2_DIV1_METHOD 3 /* 5.97% faster than 1 */ +#define HGCD_THRESHOLD 73 +#define HGCD_APPR_THRESHOLD 123 +#define HGCD_REDUCE_THRESHOLD 3664 +#define GCD_DC_THRESHOLD 562 +#define GCDEXT_DC_THRESHOLD 465 +#define 
JACOBI_BASE_METHOD 1 /* 31.16% faster than 3 */ + +/* Tuneup completed successfully, took 35114 seconds */ diff --git a/gcc/gmp/mpn/x86/goldmont/gmp-mparam.h b/gcc/gmp/mpn/x86/goldmont/gmp-mparam.h new file mode 100644 index 0000000..3d37fa3 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86/goldmont/gmp-mparam.h @@ -1,0 +1,219 @@ +/* Intel Goldmont/32 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 2200 MHz Intel Atom C3758 Goldmont/Denverton */ +/* FFT tuning limit = 67,000,000 */ +/* Generated by tuneup.c, 2019-10-22, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 7 +#define MOD_1_UNNORM_THRESHOLD 12 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 9 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 32.79% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 32 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 23 + +#define DIV_1_VS_MUL_1_PERCENT 228 + +#define MUL_TOOM22_THRESHOLD 18 +#define MUL_TOOM33_THRESHOLD 81 +#define MUL_TOOM44_THRESHOLD 193 +#define MUL_TOOM6H_THRESHOLD 286 +#define MUL_TOOM8H_THRESHOLD 399 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 125 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 137 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 185 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 32 +#define SQR_TOOM3_THRESHOLD 113 +#define SQR_TOOM4_THRESHOLD 280 +#define SQR_TOOM6_THRESHOLD 399 +#define SQR_TOOM8_THRESHOLD 547 + +#define MULMID_TOOM42_THRESHOLD 60 + +#define MULMOD_BNM1_THRESHOLD 13 +#define SQRMOD_BNM1_THRESHOLD 15 + +#define MUL_FFT_MODF_THRESHOLD 368 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 368, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 21, 7}, { 11, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \ + { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \ + { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \ + { 47,10}, { 15, 9}, { 31, 8}, { 63, 9}, \ + { 39, 
8}, { 79, 9}, { 47,10}, { 31, 9}, \ + { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \ + { 63, 9}, { 127, 8}, { 255, 9}, { 135,10}, \ + { 79, 9}, { 159,10}, { 95, 9}, { 191,11}, \ + { 63,10}, { 127, 9}, { 255, 8}, { 511,10}, \ + { 143, 9}, { 287, 8}, { 575, 9}, { 303,10}, \ + { 159,11}, { 95,10}, { 191,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,10}, { 287, 9}, { 575,10}, { 303, 9}, \ + { 607,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 351, 9}, { 703,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 415, 9}, { 831,11}, { 223,10}, \ + { 447,12}, { 127,11}, { 255,10}, { 543, 9}, \ + { 1087,11}, { 287,10}, { 607, 9}, { 1215,11}, \ + { 319,10}, { 671,11}, { 351,10}, { 703,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,10}, \ + { 831,11}, { 447,13}, { 127,12}, { 255,11}, \ + { 543,10}, { 1087,11}, { 607,10}, { 1215,12}, \ + { 319,11}, { 671,10}, { 1343,11}, { 703,10}, \ + { 1407,11}, { 735,12}, { 383,11}, { 831,12}, \ + { 447,11}, { 959,13}, { 255,12}, { 511,11}, \ + { 1087,12}, { 575,11}, { 1215,10}, { 2431,12}, \ + { 639,11}, { 1343,12}, { 703,11}, { 1407,13}, \ + { 383,12}, { 831,11}, { 1663,12}, { 959,11}, \ + { 1919,14}, { 255,13}, { 511,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1471,11}, { 2943,13}, \ + { 767,12}, { 1727,13}, { 895,12}, { 1919,11}, \ + { 3839,14}, { 511,13}, { 1023,12}, { 2111,13}, \ + { 1151,12}, { 2431,13}, { 1407,12}, { 2943,14}, \ + { 767,13}, { 1663,12}, { 3455,13}, { 1919,12}, \ + { 3839,15}, { 511,14}, { 1023,13}, { 2431,14}, \ + { 1279,13}, { 2943,12}, { 5887,14}, { 1535,13}, \ + { 3455,14}, { 1791,13}, { 3839,12}, { 7679,15}, \ + { 1023,14}, { 2303,13}, { 4991,12}, { 9983,14}, \ + { 2559,13}, { 5119,14}, { 2815,13}, { 5887,15}, \ + { 1535,14}, { 3839,13}, { 7679,16} } +#define MUL_FFT_TABLE3_SIZE 171 +#define MUL_FFT_THRESHOLD 3712 + +#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 340, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 21, 7}, { 11, 6}, \ + { 25, 7}, { 13, 6}, 
{ 27, 7}, { 15, 6}, \ + { 31, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \ + { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \ + { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \ + { 39, 9}, { 23, 8}, { 47,10}, { 15, 9}, \ + { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ + { 47,10}, { 31, 9}, { 79,10}, { 47,11}, \ + { 31,10}, { 63, 9}, { 127, 8}, { 255,10}, \ + { 79, 9}, { 159, 8}, { 319,10}, { 95, 9}, \ + { 191,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511, 9}, { 271,10}, { 143, 9}, { 287, 8}, \ + { 575, 9}, { 303, 8}, { 607, 9}, { 319,11}, \ + { 95,10}, { 191,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \ + { 287, 9}, { 575,10}, { 303, 9}, { 607,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \ + { 671,10}, { 351, 9}, { 703,11}, { 191,10}, \ + { 383, 9}, { 767,10}, { 415, 9}, { 831,11}, \ + { 223,10}, { 479,12}, { 127,11}, { 255,10}, \ + { 543, 9}, { 1087,11}, { 287,10}, { 607, 9}, \ + { 1215,11}, { 319,10}, { 671,11}, { 351,10}, \ + { 703,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,10}, { 831,11}, { 479,13}, { 127,12}, \ + { 255,11}, { 543,10}, { 1087,11}, { 607,10}, \ + { 1215,12}, { 319,11}, { 671,10}, { 1343,11}, \ + { 735,12}, { 383,11}, { 831,12}, { 447,11}, \ + { 959,13}, { 255,12}, { 511,11}, { 1087,12}, \ + { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1471,13}, { 383,12}, { 831,11}, \ + { 1663,12}, { 959,11}, { 1919,14}, { 255,13}, \ + { 511,12}, { 1215,13}, { 639,12}, { 1471,11}, \ + { 2943,13}, { 767,12}, { 1727,13}, { 895,12}, \ + { 1919,14}, { 511,13}, { 1023,12}, { 2111,13}, \ + { 1151,12}, { 2431,13}, { 1407,12}, { 2943,14}, \ + { 767,13}, { 1663,12}, { 3455,13}, { 1919,15}, \ + { 511,14}, { 1023,13}, { 2431,14}, { 1279,13}, \ + { 2943,12}, { 5887,14}, { 1535,13}, { 3455,14}, \ + { 1791,13}, { 3839,12}, { 7679,15}, { 1023,14}, \ + { 2047,13}, { 4095,14}, { 2303,13}, { 4991,12}, \ + { 9983,14}, { 2815,13}, { 5887,15}, { 1535,14}, \ + { 3839,13}, { 7679,16} } +#define SQR_FFT_TABLE3_SIZE 170 +#define SQR_FFT_THRESHOLD 3520 
+ +#define MULLO_BASECASE_THRESHOLD 5 +#define MULLO_DC_THRESHOLD 50 +#define MULLO_MUL_N_THRESHOLD 6633 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 95 +#define SQRLO_SQR_THRESHOLD 6633 + +#define DC_DIV_QR_THRESHOLD 68 +#define DC_DIVAPPR_Q_THRESHOLD 204 +#define DC_BDIV_QR_THRESHOLD 64 +#define DC_BDIV_Q_THRESHOLD 108 + +#define INV_MULMOD_BNM1_THRESHOLD 34 +#define INV_NEWTON_THRESHOLD 276 +#define INV_APPR_THRESHOLD 226 + +#define BINV_NEWTON_THRESHOLD 298 +#define REDC_1_TO_REDC_N_THRESHOLD 65 + +#define MU_DIV_QR_THRESHOLD 1528 +#define MU_DIVAPPR_Q_THRESHOLD 1589 +#define MUPI_DIV_QR_THRESHOLD 140 +#define MU_BDIV_QR_THRESHOLD 1334 +#define MU_BDIV_Q_THRESHOLD 1499 + +#define POWM_SEC_TABLE 3,16,96,428,1317 + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 18 +#define SET_STR_DC_THRESHOLD 704 +#define SET_STR_PRECOMPUTE_THRESHOLD 1358 + +#define FAC_DSC_THRESHOLD 95 +#define FAC_ODD_THRESHOLD 29 + +#define MATRIX22_STRASSEN_THRESHOLD 15 +#define HGCD2_DIV1_METHOD 1 /* 5.53% faster than 3 */ +#define HGCD_THRESHOLD 172 +#define HGCD_APPR_THRESHOLD 204 +#define HGCD_REDUCE_THRESHOLD 2479 +#define GCD_DC_THRESHOLD 610 +#define GCDEXT_DC_THRESHOLD 443 +#define JACOBI_BASE_METHOD 4 /* 6.53% faster than 3 */ + +/* Tuneup completed successfully, took 101563 seconds */ diff --git a/gcc/gmp/mpn/x86/k7/gcd_11.asm b/gcc/gmp/mpn/x86/k7/gcd_11.asm new file mode 100644 index 0000000..2648dfd 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86/k7/gcd_11.asm @@ -1,0 +1,107 @@ +dnl x86 mpn_gcd_11 optimised for AMD K7. + +dnl Contributed to the GNU project by Kevin Ryde. Rehacked by Torbjorn +dnl Granlund. + +dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2014, 2015 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit (approx) +C AMD K7 5.31 +C AMD K8,K9 5.33 +C AMD K10 5.30 +C AMD bd1 ? +C AMD bobcat 7.02 +C Intel P4-2 10.1 +C Intel P4-3/4 10.0 +C Intel P6/13 5.88 +C Intel core2 6.26 +C Intel NHM 6.83 +C Intel SBR 8.50 +C Intel atom 8.90 +C VIA nano ? +C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1 + + +C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. 
+ +deflit(MAXSHIFT, 6) +deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1)) + +DEF_OBJECT(ctz_table,64) + .byte MAXSHIFT +forloop(i,1,MASK, +` .byte m4_count_trailing_zeros(i) +') +END_OBJECT(ctz_table) + + +define(`u0', `%eax') +define(`v0', `%edx') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_gcd_11) + push %edi + push %esi + + mov 12(%esp), %eax + mov 16(%esp), %edx + + LEAL( ctz_table, %esi) + jmp L(odd) + + ALIGN(16) C +L(top): cmovc( %ecx, %eax) C u = |v - u| + cmovc( %edi, %edx) C v = min(u,v) +L(mid): and $MASK, %ecx C idx = low MAXSHIFT bits of difference + movzbl (%esi,%ecx), %ecx C count = ctz_table[idx], flags still from the and + jz L(shift_alot) C idx was zero, drop MAXSHIFT bits and retry + shr %cl, %eax C strip count low zero bits from u +L(odd): mov %eax, %edi C edi = u, kept for the min select at top + mov %edx, %ecx C ecx = v + sub %eax, %ecx C ecx = v - u + sub %edx, %eax C eax = u - v, carry set if u below v + jnz L(top) C loop until u equals v + +L(end): mov %edx, %eax C result is v + pop %esi + pop %edi + ret + +L(shift_alot): + shr $MAXSHIFT, %eax C low MAXSHIFT bits of u were all zero + mov %eax, %ecx C look up the next bits of u + jmp L(mid) +EPILOGUE() +ASM_END() diff --git a/gcc/gmp/mpn/x86/p6/gcd_11.asm b/gcc/gmp/mpn/x86/p6/gcd_11.asm new file mode 100644 index 0000000..80e055e 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86/p6/gcd_11.asm @@ -1,0 +1,83 @@ +dnl x86 mpn_gcd_11 optimised for processors with fast BSF. + +dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked by Torbjorn Granlund. + +dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2015 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit (approx) +C AMD K7 7.80 +C AMD K8,K9 7.79 +C AMD K10 4.08 +C AMD bd1 ? +C AMD bobcat 7.82 +C Intel P4-2 14.9 +C Intel P4-3/4 14.0 +C Intel P6/13 5.09 +C Intel core2 4.22 +C Intel NHM 5.00 +C Intel SBR 5.00 +C Intel atom 17.1 +C VIA nano ? +C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1 + + +define(`u0', `%eax') +define(`v0', `%edx') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_gcd_11) + push %edi + push %esi + + mov 12(%esp), %eax + mov 16(%esp), %edx + jmp L(odd) + + ALIGN(16) C K10 BD C2 NHM SBR +L(top): cmovc( %esi, %eax) C u = |v - u| 0,3 0,3 0,6 0,5 0,5 + cmovc( %edi, %edx) C v = min(u,v) 0,3 0,3 2,8 1,7 1,7 + shr %cl, %eax C 1,7 1,6 2,8 2,8 2,8 +L(odd): mov %edx, %esi C 1 1 4 3 3 + sub %eax, %esi C 2 2 5 4 4 + bsf %esi, %ecx C 3 3 6 5 5 + mov %eax, %edi C 2 2 3 3 4 + sub %edx, %eax C 2 2 4 3 4 + jnz L(top) C + +L(end): mov %edx, %eax + pop %esi + pop %edi + ret +EPILOGUE() diff --git a/gcc/gmp/mpn/x86/silvermont/gmp-mparam.h b/gcc/gmp/mpn/x86/silvermont/gmp-mparam.h new file mode 100644 index 0000000..e9f1d8f 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86/silvermont/gmp-mparam.h @@ -1,0 +1,222 @@ +/* Intel Silvermont/32 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 2400 MHz Intel Atom C2758 Silvermont/Rangeley */ +/* FFT tuning limit = 67,000,000 */ +/* Generated by tuneup.c, 2019-10-30, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 5 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 9 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 16 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 64.62% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 32 + +#define DIV_1_VS_MUL_1_PERCENT 204 + +#define MUL_TOOM22_THRESHOLD 26 +#define MUL_TOOM33_THRESHOLD 105 +#define MUL_TOOM44_THRESHOLD 236 +#define MUL_TOOM6H_THRESHOLD 351 +#define MUL_TOOM8H_THRESHOLD 502 + +#define 
MUL_TOOM32_TO_TOOM43_THRESHOLD 105 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 163 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 137 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 174 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 215 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 36 +#define SQR_TOOM3_THRESHOLD 138 +#define SQR_TOOM4_THRESHOLD 360 +#define SQR_TOOM6_THRESHOLD 494 +#define SQR_TOOM8_THRESHOLD 620 + +#define MULMID_TOOM42_THRESHOLD 58 + +#define MULMOD_BNM1_THRESHOLD 15 +#define SQRMOD_BNM1_THRESHOLD 19 + +#define MUL_FFT_MODF_THRESHOLD 460 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 460, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 21, 7}, { 11, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 15, 6}, { 31, 7}, { 17, 6}, \ + { 35, 7}, { 19, 6}, { 39, 7}, { 21, 8}, \ + { 11, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ + { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \ + { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ + { 47, 8}, { 95,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159,10}, { 95,11}, \ + { 63,10}, { 127, 9}, { 255,10}, { 143, 9}, \ + { 287,10}, { 159,11}, { 95,10}, { 191,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543,10}, { 287, 9}, { 575,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \ + { 671,10}, { 351, 9}, { 703,11}, { 191,10}, \ + { 383, 9}, { 767,10}, { 415, 9}, { 831,12}, \ + { 127,11}, { 255,10}, { 543, 9}, { 1087,11}, \ + { 287,10}, { 607, 9}, { 1215,11}, { 319,10}, \ + { 671,11}, { 351,10}, { 735,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,10}, { 831,13}, \ + { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \ + { 607,10}, { 1215,12}, { 319,11}, { 671,10}, \ + { 1343,11}, { 735,10}, { 1471,12}, { 383,11}, \ + { 863,10}, { 1727,12}, { 447,11}, { 959,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1215,10}, { 2431,12}, { 
639,11}, { 1343,12}, \ + { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \ + { 1535,12}, { 831,11}, { 1727,10}, { 3455,12}, \ + { 959,11}, { 1919,14}, { 255,13}, { 511,12}, \ + { 1215,11}, { 2431,13}, { 639,12}, { 1471,11}, \ + { 2943,13}, { 767,12}, { 1727,11}, { 3455,13}, \ + { 895,12}, { 1919,14}, { 511,13}, { 1023,12}, \ + { 2111,13}, { 1151,12}, { 2431,13}, { 1407,12}, \ + { 2943,14}, { 767,13}, { 1663,12}, { 3455,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,12}, \ + { 4479,13}, { 2431,14}, { 1279,13}, { 2943,12}, \ + { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \ + { 3967,12}, { 7935,15}, { 1023,14}, { 2047,13}, \ + { 4479,14}, { 2303,13}, { 4991,12}, { 9983,14}, \ + { 2815,13}, { 5887,15}, { 1535,14}, { 3839,13}, \ + { 7935,16} } +#define MUL_FFT_TABLE3_SIZE 177 +#define MUL_FFT_THRESHOLD 4544 + +#define SQR_FFT_MODF_THRESHOLD 400 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 400, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 13, 5}, { 28, 6}, \ + { 21, 7}, { 11, 6}, { 25, 7}, { 13, 6}, \ + { 28, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \ + { 35, 7}, { 19, 6}, { 39, 7}, { 21, 8}, \ + { 11, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ + { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \ + { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ + { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 127,10}, \ + { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 143, 9}, { 287, 8}, \ + { 575,10}, { 159,11}, { 95,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,10}, { 287, 9}, { 575,11}, { 159,10}, \ + { 319, 9}, { 639,10}, { 335, 9}, { 671,10}, \ + { 351, 9}, { 735,11}, { 191,10}, { 383, 9}, \ + { 799,10}, { 415, 9}, { 831,11}, { 223,12}, \ + { 127,11}, { 255,10}, { 543, 9}, { 1087,11}, \ + { 287,10}, { 607, 9}, { 1215,11}, { 319,10}, \ + { 671,11}, { 351,10}, { 735, 9}, { 1471,12}, \ + { 191,11}, { 383,10}, { 799,11}, { 415,10}, \ 
+ { 863,13}, { 127,12}, { 255,11}, { 543,10}, \ + { 1087,11}, { 607,10}, { 1215,12}, { 319,11}, \ + { 671,10}, { 1343,11}, { 735,10}, { 1471,12}, \ + { 383,11}, { 863,10}, { 1727,12}, { 447,11}, \ + { 959,13}, { 255,12}, { 511,11}, { 1087,12}, \ + { 575,11}, { 1215,10}, { 2431,12}, { 639,11}, \ + { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 831,11}, { 1727,12}, \ + { 959,14}, { 255,13}, { 511,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1471,11}, { 2943,13}, \ + { 767,12}, { 1727,11}, { 3455,13}, { 895,12}, \ + { 1919,14}, { 511,13}, { 1023,12}, { 2111,13}, \ + { 1151,12}, { 2431,13}, { 1407,12}, { 2943,14}, \ + { 767,13}, { 1663,12}, { 3455,13}, { 1919,15}, \ + { 511,14}, { 1023,13}, { 2175,12}, { 4479,13}, \ + { 2431,14}, { 1279,13}, { 2943,12}, { 5887,14}, \ + { 1535,13}, { 3455,14}, { 1791,13}, { 3967,15}, \ + { 1023,14}, { 2047,13}, { 4479,14}, { 2303,13}, \ + { 4991,12}, { 9983,14}, { 2815,13}, { 5887,15}, \ + { 1535,14}, { 3839,13}, { 7679,16} } +#define SQR_FFT_TABLE3_SIZE 175 +#define SQR_FFT_THRESHOLD 3712 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 56 +#define MULLO_MUL_N_THRESHOLD 8907 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 137 +#define SQRLO_SQR_THRESHOLD 7373 + +#define DC_DIV_QR_THRESHOLD 76 +#define DC_DIVAPPR_Q_THRESHOLD 336 +#define DC_BDIV_QR_THRESHOLD 66 +#define DC_BDIV_Q_THRESHOLD 218 + +#define INV_MULMOD_BNM1_THRESHOLD 50 +#define INV_NEWTON_THRESHOLD 345 +#define INV_APPR_THRESHOLD 342 + +#define BINV_NEWTON_THRESHOLD 366 +#define REDC_1_TO_REDC_N_THRESHOLD 91 + +#define MU_DIV_QR_THRESHOLD 1652 +#define MU_DIVAPPR_Q_THRESHOLD 1858 +#define MUPI_DIV_QR_THRESHOLD 171 +#define MU_BDIV_QR_THRESHOLD 1442 +#define MU_BDIV_Q_THRESHOLD 1830 + +#define POWM_SEC_TABLE 3,17,102,404,1185 + +#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 21 +#define SET_STR_DC_THRESHOLD 272 +#define SET_STR_PRECOMPUTE_THRESHOLD 788 + +#define 
FAC_DSC_THRESHOLD 132 +#define FAC_ODD_THRESHOLD 34 + +#define MATRIX22_STRASSEN_THRESHOLD 19 +#define HGCD2_DIV1_METHOD 1 /* 0.59% faster than 3 */ +#define HGCD_THRESHOLD 142 +#define HGCD_APPR_THRESHOLD 181 +#define HGCD_REDUCE_THRESHOLD 2681 +#define GCD_DC_THRESHOLD 492 +#define GCDEXT_DC_THRESHOLD 365 +#define JACOBI_BASE_METHOD 1 /* 0.41% faster than 2 */ + +/* Tuneup completed successfully, took 147027 seconds */ diff --git a/gcc/gmp/mpn/x86/skylake/gmp-mparam.h b/gcc/gmp/mpn/x86/skylake/gmp-mparam.h new file mode 100644 index 0000000..fb87957 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86/skylake/gmp-mparam.h @@ -1,0 +1,211 @@ +/* x86/skylake gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 3600-4000 MHz Intel Xeon E3-1270v5 Skylake */ +/* FFT tuning limit = 67,000,000 */ +/* Generated by tuneup.c, 2019-10-21, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 15 +#define MOD_1_UNNORM_THRESHOLD 16 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 10 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 10 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 5.63% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 12 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 17 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 18 + +#define DIV_1_VS_MUL_1_PERCENT 348 + +#define MUL_TOOM22_THRESHOLD 24 +#define MUL_TOOM33_THRESHOLD 81 +#define MUL_TOOM44_THRESHOLD 208 +#define MUL_TOOM6H_THRESHOLD 303 +#define MUL_TOOM8H_THRESHOLD 454 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 149 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 137 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 145 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 196 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 40 +#define SQR_TOOM3_THRESHOLD 129 +#define SQR_TOOM4_THRESHOLD 220 +#define SQR_TOOM6_THRESHOLD 354 +#define SQR_TOOM8_THRESHOLD 608 + +#define MULMID_TOOM42_THRESHOLD 72 + +#define MULMOD_BNM1_THRESHOLD 17 +#define SQRMOD_BNM1_THRESHOLD 21 + +#define MUL_FFT_MODF_THRESHOLD 530 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 530, 5}, { 29, 6}, { 15, 5}, { 31, 6}, \ + { 29, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \ + { 36, 7}, { 19, 6}, { 39, 7}, { 21, 6}, \ + { 43, 7}, { 23, 6}, { 47, 7}, { 29, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \ + { 23, 7}, { 49, 8}, { 27, 7}, { 55, 9}, \ + { 15, 8}, { 31, 7}, { 63, 8}, { 43, 9}, \ + { 23, 8}, { 51, 9}, { 31, 8}, { 
67, 9}, \ + { 39, 8}, { 83, 9}, { 47, 8}, { 95, 9}, \ + { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \ + { 79, 9}, { 159,10}, { 95, 9}, { 191,10}, \ + { 111,11}, { 63,10}, { 143, 9}, { 287,10}, \ + { 159,11}, { 95,10}, { 191,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,10}, { 287,11}, { 159,10}, { 351,11}, \ + { 191,10}, { 415,12}, { 127,11}, { 255,10}, \ + { 543,11}, { 287,10}, { 607,11}, { 319,10}, \ + { 671,11}, { 351,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,13}, { 127,12}, { 255,11}, \ + { 543,10}, { 1087,11}, { 607,12}, { 319,11}, \ + { 671,10}, { 1343,11}, { 735,12}, { 383,11}, \ + { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \ + { 959,13}, { 255,12}, { 511,11}, { 1087,12}, \ + { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \ + { 703,13}, { 383,12}, { 767,11}, { 1599,12}, \ + { 831,11}, { 1727,12}, { 959,14}, { 255,13}, \ + { 511,12}, { 1087,11}, { 2239,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1471,13}, { 767,12}, \ + { 1727,13}, { 895,12}, { 1919,14}, { 511,13}, \ + { 1023,12}, { 2239,13}, { 1151,12}, { 2431,13}, \ + { 1279,12}, { 2623,13}, { 1407,12}, { 2815,14}, \ + { 767,13}, { 1663,12}, { 3455,13}, { 1919,15}, \ + { 511,14}, { 1023,13}, { 2175,12}, { 4479,13}, \ + { 2431,14}, { 1279,13}, { 2943,12}, { 5887,14}, \ + { 1535,13}, { 3455,14}, { 1791,13}, { 3967,15}, \ + { 1023,14}, { 2047,13}, { 4479,14}, { 2303,13}, \ + { 4991,12}, { 9983,14}, { 2815,13}, { 5887,15}, \ + { 1535,14}, { 3839,16} } +#define MUL_FFT_TABLE3_SIZE 154 +#define MUL_FFT_THRESHOLD 6784 + +#define SQR_FFT_MODF_THRESHOLD 460 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 460, 5}, { 29, 6}, { 15, 5}, { 31, 6}, \ + { 29, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \ + { 36, 7}, { 19, 6}, { 39, 7}, { 29, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 49, 8}, { 27, 7}, { 55, 9}, \ + { 15, 8}, { 31, 7}, { 63, 8}, { 43, 9}, \ + { 23, 8}, { 55,10}, { 15, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, 
\ + { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159,10}, { 95,11}, \ + { 63,10}, { 127, 9}, { 255,10}, { 143, 9}, \ + { 287,10}, { 159,11}, { 95,12}, { 63,11}, \ + { 127,10}, { 271, 9}, { 543,10}, { 287,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 351,11}, \ + { 191,10}, { 415,12}, { 127,11}, { 255,10}, \ + { 543,11}, { 287,10}, { 575,11}, { 319,10}, \ + { 671,11}, { 351,10}, { 703,12}, { 191,11}, \ + { 383,10}, { 799,11}, { 415,10}, { 831,13}, \ + { 127,12}, { 255,11}, { 511,10}, { 1023,11}, \ + { 543,10}, { 1087,11}, { 607,12}, { 319,11}, \ + { 671,10}, { 1343,11}, { 735,12}, { 383,11}, \ + { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \ + { 927,13}, { 255,12}, { 511,11}, { 1087,12}, \ + { 575,11}, { 1215,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1407,13}, { 383,12}, { 767,11}, \ + { 1599,12}, { 831,11}, { 1727,12}, { 895,11}, \ + { 1791,14}, { 255,13}, { 511,12}, { 1087,11}, \ + { 2239,12}, { 1215,13}, { 639,12}, { 1471,13}, \ + { 767,12}, { 1727,13}, { 895,12}, { 1919,14}, \ + { 511,13}, { 1023,12}, { 2239,13}, { 1151,12}, \ + { 2431,13}, { 1279,12}, { 2623,13}, { 1407,12}, \ + { 2815,14}, { 767,13}, { 1663,12}, { 3455,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,12}, \ + { 4479,13}, { 2431,14}, { 1279,13}, { 2943,12}, \ + { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \ + { 3967,15}, { 1023,14}, { 2047,13}, { 4479,14}, \ + { 2303,13}, { 4991,12}, { 9983,14}, { 2815,13}, \ + { 5887,15}, { 1535,14}, { 3839,16} } +#define SQR_FFT_TABLE3_SIZE 155 +#define SQR_FFT_THRESHOLD 5568 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 68 +#define MULLO_MUL_N_THRESHOLD 13555 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 117 +#define SQRLO_SQR_THRESHOLD 10988 + +#define DC_DIV_QR_THRESHOLD 42 +#define DC_DIVAPPR_Q_THRESHOLD 163 +#define DC_BDIV_QR_THRESHOLD 66 +#define DC_BDIV_Q_THRESHOLD 160 + +#define INV_MULMOD_BNM1_THRESHOLD 46 +#define 
INV_NEWTON_THRESHOLD 165 +#define INV_APPR_THRESHOLD 157 + +#define BINV_NEWTON_THRESHOLD 300 +#define REDC_1_TO_REDC_N_THRESHOLD 68 + +#define MU_DIV_QR_THRESHOLD 1718 +#define MU_DIVAPPR_Q_THRESHOLD 1685 +#define MUPI_DIV_QR_THRESHOLD 62 +#define MU_BDIV_QR_THRESHOLD 1589 +#define MU_BDIV_Q_THRESHOLD 1830 + +#define POWM_SEC_TABLE 1,17,129,547,1317 + +#define GET_STR_DC_THRESHOLD 10 +#define GET_STR_PRECOMPUTE_THRESHOLD 16 +#define SET_STR_DC_THRESHOLD 354 +#define SET_STR_PRECOMPUTE_THRESHOLD 860 + +#define FAC_DSC_THRESHOLD 141 +#define FAC_ODD_THRESHOLD 34 + +#define MATRIX22_STRASSEN_THRESHOLD 20 +#define HGCD2_DIV1_METHOD 5 /* 1.04% faster than 3 */ +#define HGCD_THRESHOLD 114 +#define HGCD_APPR_THRESHOLD 132 +#define HGCD_REDUCE_THRESHOLD 3524 +#define GCD_DC_THRESHOLD 474 +#define GCDEXT_DC_THRESHOLD 379 +#define JACOBI_BASE_METHOD 1 /* 27.39% faster than 4 */ + +/* Tuneup completed successfully, took 31721 seconds */ diff --git a/gcc/gmp/mpn/x86/zn1/gmp-mparam.h b/gcc/gmp/mpn/x86/zn1/gmp-mparam.h new file mode 100644 index 0000000..8e6c052 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86/zn1/gmp-mparam.h @@ -1,0 +1,220 @@ +/* AMD zn1/32 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 3700-4300 MHz Pinnacle Ridge */ +/* FFT tuning limit = 67,000,000 */ +/* Generated by tuneup.c, 2019-10-21, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 3 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 14.00% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 4 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 22 + +#define DIV_1_VS_MUL_1_PERCENT 248 + +#define MUL_TOOM22_THRESHOLD 28 +#define MUL_TOOM33_THRESHOLD 91 +#define MUL_TOOM44_THRESHOLD 137 +#define MUL_TOOM6H_THRESHOLD 222 +#define MUL_TOOM8H_THRESHOLD 454 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 85 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 103 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 88 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 105 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 63 +#define SQR_TOOM3_THRESHOLD 98 +#define SQR_TOOM4_THRESHOLD 172 +#define SQR_TOOM6_THRESHOLD 286 +#define SQR_TOOM8_THRESHOLD 478 + +#define MULMID_TOOM42_THRESHOLD 64 + +#define MULMOD_BNM1_THRESHOLD 21 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define MUL_FFT_MODF_THRESHOLD 606 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 606, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 15, 5}, { 31, 6}, { 27, 7}, { 15, 
6}, \ + { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ + { 43, 9}, { 23, 8}, { 51, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \ + { 95,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 127,10}, \ + { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \ + { 159,11}, { 95,10}, { 191,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543, 8}, { 1087,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 335, 9}, { 671,11}, { 191,10}, \ + { 383, 9}, { 767,10}, { 399,11}, { 223,12}, \ + { 127,11}, { 255,10}, { 543, 9}, { 1087,11}, \ + { 287,10}, { 607, 9}, { 1215,11}, { 319,10}, \ + { 671, 9}, { 1343,11}, { 351,12}, { 191,11}, \ + { 383,10}, { 799,11}, { 415,13}, { 127,12}, \ + { 255,11}, { 543,10}, { 1087,11}, { 607,10}, \ + { 1215,12}, { 319,11}, { 671,10}, { 1343,11}, \ + { 735,10}, { 1471,12}, { 383,11}, { 799,10}, \ + { 1599,11}, { 863,10}, { 1727,12}, { 447,11}, \ + { 959,10}, { 1919,11}, { 991,13}, { 255,12}, \ + { 511,11}, { 1087,12}, { 575,11}, { 1215,10}, \ + { 2431,12}, { 639,11}, { 1343,12}, { 703,11}, \ + { 1471,10}, { 2943,13}, { 383,12}, { 767,11}, \ + { 1599,12}, { 831,11}, { 1727,10}, { 3455,12}, \ + { 959,11}, { 1919,14}, { 255,13}, { 511,12}, \ + { 1087,11}, { 2239,12}, { 1215,11}, { 2431,13}, \ + { 639,12}, { 1471,11}, { 2943,13}, { 767,12}, \ + { 1727,11}, { 3455,13}, { 895,12}, { 1983,14}, \ + { 511,13}, { 1023,12}, { 2239,13}, { 1151,12}, \ + { 2495,13}, { 1279,12}, { 2623,13}, { 1407,12}, \ + { 2943,14}, { 767,13}, { 1663,12}, { 3455,13}, \ + { 1919,12}, { 3839,15}, { 511,14}, { 1023,13}, \ + { 2175,12}, { 4479,13}, { 2431,14}, { 1279,13}, \ + { 2943,12}, { 5887,14}, { 1535,13}, { 3455,14}, \ + { 1791,13}, { 3967,12}, { 7935,11}, { 15871,15}, \ + { 1023,14}, { 2047,13}, { 4479,14}, { 2303,13}, \ + { 4991,12}, { 9983,14}, { 2815,13}, { 5887,15}, \ + { 1535,14}, { 3839,13}, { 7935,12}, { 
15871,16} } +#define MUL_FFT_TABLE3_SIZE 172 +#define MUL_FFT_THRESHOLD 5760 + +#define SQR_FFT_MODF_THRESHOLD 464 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 464, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 25, 7}, { 13, 6}, { 28, 7}, { 15, 6}, \ + { 31, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \ + { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \ + { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \ + { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \ + { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 143, 9}, { 287,10}, \ + { 159,11}, { 95,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \ + { 287, 9}, { 575,11}, { 159, 9}, { 639,10}, \ + { 335, 9}, { 671,10}, { 351, 9}, { 703,11}, \ + { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \ + { 799,10}, { 415,12}, { 127,11}, { 255,10}, \ + { 543,11}, { 287,10}, { 607,11}, { 319,10}, \ + { 671,11}, { 351,10}, { 703,12}, { 191,11}, \ + { 383,10}, { 799,11}, { 415,10}, { 831,13}, \ + { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \ + { 607,10}, { 1215,12}, { 319,11}, { 671,10}, \ + { 1343,11}, { 735,10}, { 1471,12}, { 383,11}, \ + { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \ + { 959,13}, { 255,12}, { 511,11}, { 1087,12}, \ + { 575,11}, { 1215,10}, { 2431,12}, { 639,11}, \ + { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \ + { 767,11}, { 1599,12}, { 831,11}, { 1727,10}, \ + { 3455,12}, { 959,11}, { 1919,14}, { 255,13}, \ + { 511,12}, { 1087,11}, { 2239,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1471,11}, { 2943,13}, \ + { 767,12}, { 1727,11}, { 3455,13}, { 895,12}, \ + { 1919,14}, { 511,13}, { 1023,12}, { 2239,13}, \ + { 1151,12}, { 2431,13}, { 1279,12}, { 2623,13}, \ + { 1407,12}, { 2943,14}, { 767,13}, { 1663,12}, \ + { 3455,13}, { 1919,12}, { 3839,15}, { 511,14}, \ + { 1023,13}, { 2175,12}, { 4479,13}, { 2431,14}, \ + { 1279,13}, { 
2943,12}, { 5887,14}, { 1535,13}, \ + { 3455,14}, { 1791,13}, { 3839,12}, { 7679,13}, \ + { 3967,12}, { 7935,15}, { 1023,14}, { 2047,13}, \ + { 4479,14}, { 2303,13}, { 4991,12}, { 9983,14}, \ + { 2815,13}, { 5887,15}, { 1535,14}, { 3839,13}, \ + { 7935,16} } +#define SQR_FFT_TABLE3_SIZE 173 +#define SQR_FFT_THRESHOLD 4736 + +#define MULLO_BASECASE_THRESHOLD 3 +#define MULLO_DC_THRESHOLD 60 +#define MULLO_MUL_N_THRESHOLD 11278 +#define SQRLO_BASECASE_THRESHOLD 8 +#define SQRLO_DC_THRESHOLD 161 +#define SQRLO_SQR_THRESHOLD 9335 + +#define DC_DIV_QR_THRESHOLD 71 +#define DC_DIVAPPR_Q_THRESHOLD 206 +#define DC_BDIV_QR_THRESHOLD 63 +#define DC_BDIV_Q_THRESHOLD 126 + +#define INV_MULMOD_BNM1_THRESHOLD 78 +#define INV_NEWTON_THRESHOLD 274 +#define INV_APPR_THRESHOLD 228 + +#define BINV_NEWTON_THRESHOLD 274 +#define REDC_1_TO_REDC_N_THRESHOLD 71 + +#define MU_DIV_QR_THRESHOLD 1652 +#define MU_DIVAPPR_Q_THRESHOLD 1718 +#define MUPI_DIV_QR_THRESHOLD 122 +#define MU_BDIV_QR_THRESHOLD 1470 +#define MU_BDIV_Q_THRESHOLD 1589 + +#define POWM_SEC_TABLE 3,28,54,386,1337 + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 19 +#define SET_STR_DC_THRESHOLD 262 +#define SET_STR_PRECOMPUTE_THRESHOLD 558 + +#define FAC_DSC_THRESHOLD 109 +#define FAC_ODD_THRESHOLD 39 + +#define MATRIX22_STRASSEN_THRESHOLD 21 +#define HGCD2_DIV1_METHOD 1 /* 7.49% faster than 3 */ +#define HGCD_THRESHOLD 74 +#define HGCD_APPR_THRESHOLD 70 +#define HGCD_REDUCE_THRESHOLD 3389 +#define GCD_DC_THRESHOLD 440 +#define GCDEXT_DC_THRESHOLD 327 +#define JACOBI_BASE_METHOD 1 /* 11.98% faster than 3 */ + +/* Tuneup completed successfully, took 36916 seconds */ diff --git a/gcc/gmp/mpn/x86/zn2/gmp-mparam.h b/gcc/gmp/mpn/x86/zn2/gmp-mparam.h new file mode 100644 index 0000000..152e6b7 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86/zn2/gmp-mparam.h @@ -1,0 +1,226 @@ +/* AMD zn2/32 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. 
+ +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 3600-4400 MHz Matisse */ +/* FFT tuning limit = 67,000,000 */ +/* Generated by tuneup.c, 2019-10-23, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 15 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 4.78% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD 3 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 7 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 23 + +#define DIV_1_VS_MUL_1_PERCENT 274 + +#define MUL_TOOM22_THRESHOLD 24 +#define MUL_TOOM33_THRESHOLD 85 +#define MUL_TOOM44_THRESHOLD 166 +#define MUL_TOOM6H_THRESHOLD 290 +#define MUL_TOOM8H_THRESHOLD 430 + +#define 
MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 97 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 26 +#define SQR_TOOM3_THRESHOLD 153 +#define SQR_TOOM4_THRESHOLD 214 +#define SQR_TOOM6_THRESHOLD 318 +#define SQR_TOOM8_THRESHOLD 478 + +#define MULMID_TOOM42_THRESHOLD 48 + +#define MULMOD_BNM1_THRESHOLD 18 +#define SQRMOD_BNM1_THRESHOLD 24 + +#define MUL_FFT_MODF_THRESHOLD 444 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 444, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ + { 31, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ + { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \ + { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ + { 47,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 127,10}, \ + { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \ + { 287, 8}, { 575,10}, { 159,11}, { 95,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543,10}, { 287, 9}, { 575,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \ + { 671, 8}, { 1343,10}, { 351, 9}, { 703,10}, \ + { 367, 9}, { 735,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 415,11}, { 223,10}, { 447,12}, \ + { 127,11}, { 255,10}, { 543, 9}, { 1087,11}, \ + { 287,10}, { 607,11}, { 319,10}, { 671, 9}, \ + { 1343,11}, { 351,10}, { 735,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,10}, { 831,11}, \ + { 447,13}, { 127,12}, { 255,11}, { 543,10}, \ + { 1087,11}, { 607,10}, { 1215,12}, { 319,11}, \ + { 671,10}, { 1343,11}, { 735,10}, { 1471, 9}, \ + { 2943,12}, { 383,11}, { 799,10}, { 1599,11}, \ + { 863,12}, { 447,11}, { 959,10}, { 1919,13}, \ + { 255,12}, { 511,11}, { 
1087,12}, { 575,11}, \ + { 1215,10}, { 2431,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1471,10}, { 2943,13}, { 383,12}, \ + { 767,11}, { 1599,12}, { 831,11}, { 1727,10}, \ + { 3455,12}, { 959,11}, { 1919,10}, { 3839,14}, \ + { 255,13}, { 511,12}, { 1215,11}, { 2431,13}, \ + { 639,12}, { 1471,11}, { 2943,10}, { 5887,13}, \ + { 767,12}, { 1727,11}, { 3455,13}, { 895,12}, \ + { 1919,11}, { 3839,14}, { 511,13}, { 1023,12}, \ + { 2111,13}, { 1151,12}, { 2431,13}, { 1407,12}, \ + { 2943,11}, { 5887,14}, { 767,13}, { 1663,12}, \ + { 3455,13}, { 1919,12}, { 3839,15}, { 511,14}, \ + { 1023,13}, { 2431,14}, { 1279,13}, { 2943,12}, \ + { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \ + { 3839,12}, { 7679,13}, { 3967,12}, { 7935,11}, \ + { 15871,15}, { 1023,14}, { 2047,13}, { 4351,14}, \ + { 2303,13}, { 4991,12}, { 9983,14}, { 2815,13}, \ + { 5887,15}, { 1535,14}, { 3839,13}, { 7935,12}, \ + { 15871,16} } +#define MUL_FFT_TABLE3_SIZE 189 +#define MUL_FFT_THRESHOLD 4736 + +#define SQR_FFT_MODF_THRESHOLD 404 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 404, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ + { 31, 7}, { 19, 6}, { 39, 7}, { 23, 6}, \ + { 47, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \ + { 47,10}, { 15, 9}, { 31, 8}, { 63, 9}, \ + { 39, 8}, { 79, 9}, { 47,10}, { 31, 9}, \ + { 79,10}, { 47,11}, { 31,10}, { 63, 9}, \ + { 127,10}, { 95,11}, { 63,10}, { 127, 9}, \ + { 255, 8}, { 511, 9}, { 271,10}, { 143, 9}, \ + { 287, 8}, { 607, 7}, { 1215,11}, { 95,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543, 8}, { 1087, 9}, { 607, 8}, \ + { 1215,11}, { 159, 9}, { 671, 8}, { 1343,10}, \ + { 351, 9}, { 735, 8}, { 1471,11}, { 191,10}, \ + { 383, 9}, { 767,10}, { 415,11}, { 223,12}, \ + { 127,11}, { 255,10}, { 543, 9}, { 1087,10}, \ + { 607, 9}, { 1215, 8}, { 2431,10}, { 671, 9}, \ + { 1343,10}, { 735, 
9}, { 1471,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,10}, { 831,13}, \ + { 127,12}, { 255,11}, { 543,10}, { 1087,11}, \ + { 607,10}, { 1215, 9}, { 2431,11}, { 671,10}, \ + { 1343,11}, { 735,10}, { 1471, 9}, { 2943,12}, \ + { 383,11}, { 863,12}, { 447,11}, { 959,10}, \ + { 1919,13}, { 255,12}, { 511,11}, { 1087,12}, \ + { 575,11}, { 1215,10}, { 2431,12}, { 639,11}, \ + { 1343,12}, { 703,11}, { 1471,10}, { 2943, 9}, \ + { 5887,12}, { 767,11}, { 1599,12}, { 831,11}, \ + { 1727,12}, { 959,11}, { 1919,10}, { 3839,14}, \ + { 255,13}, { 511,12}, { 1215,11}, { 2431,13}, \ + { 639,12}, { 1471,11}, { 2943,10}, { 5887,13}, \ + { 767,12}, { 1727,13}, { 895,12}, { 1919,11}, \ + { 3839,14}, { 511,13}, { 1023,12}, { 2111,13}, \ + { 1151,12}, { 2431,13}, { 1279,12}, { 2623,13}, \ + { 1407,12}, { 2943,11}, { 5887,14}, { 767,13}, \ + { 1663,12}, { 3455,13}, { 1919,12}, { 3839,15}, \ + { 511,14}, { 1023,13}, { 2431,14}, { 1279,13}, \ + { 2943,12}, { 5887,14}, { 1535,13}, { 3455,14}, \ + { 1791,13}, { 3839,12}, { 7679,13}, { 3967,12}, \ + { 7935,11}, { 15871,15}, { 1023,14}, { 2047,13}, \ + { 4223,14}, { 2303,13}, { 4991,12}, { 9983,14}, \ + { 2815,13}, { 5887,15}, { 1535,14}, { 3839,13}, \ + { 7935,12}, { 15871,16} } +#define SQR_FFT_TABLE3_SIZE 178 +#define SQR_FFT_THRESHOLD 3712 + +#define MULLO_BASECASE_THRESHOLD 4 +#define MULLO_DC_THRESHOLD 62 +#define MULLO_MUL_N_THRESHOLD 8907 +#define SQRLO_BASECASE_THRESHOLD 8 +#define SQRLO_DC_THRESHOLD 107 +#define SQRLO_SQR_THRESHOLD 6633 + +#define DC_DIV_QR_THRESHOLD 54 +#define DC_DIVAPPR_Q_THRESHOLD 206 +#define DC_BDIV_QR_THRESHOLD 55 +#define DC_BDIV_Q_THRESHOLD 136 + +#define INV_MULMOD_BNM1_THRESHOLD 74 +#define INV_NEWTON_THRESHOLD 212 +#define INV_APPR_THRESHOLD 204 + +#define BINV_NEWTON_THRESHOLD 292 +#define REDC_1_TO_REDC_N_THRESHOLD 67 + +#define MU_DIV_QR_THRESHOLD 1442 +#define MU_DIVAPPR_Q_THRESHOLD 1528 +#define MUPI_DIV_QR_THRESHOLD 97 +#define MU_BDIV_QR_THRESHOLD 1142 +#define MU_BDIV_Q_THRESHOLD 1470 + 
+#define POWM_SEC_TABLE 1,16,96,386,1555 + +#define GET_STR_DC_THRESHOLD 10 +#define GET_STR_PRECOMPUTE_THRESHOLD 16 +#define SET_STR_DC_THRESHOLD 303 +#define SET_STR_PRECOMPUTE_THRESHOLD 748 + +#define FAC_DSC_THRESHOLD 141 +#define FAC_ODD_THRESHOLD 55 + +#define MATRIX22_STRASSEN_THRESHOLD 20 +#define HGCD2_DIV1_METHOD 1 /* 14.03% faster than 3 */ +#define HGCD_THRESHOLD 103 +#define HGCD_APPR_THRESHOLD 127 +#define HGCD_REDUCE_THRESHOLD 3014 +#define GCD_DC_THRESHOLD 396 +#define GCDEXT_DC_THRESHOLD 265 +#define JACOBI_BASE_METHOD 1 /* 47.88% faster than 4 */ + +/* Tuneup completed successfully, took 29014 seconds */ diff --git a/gcc/gmp/mpn/x86_64/atom/cnd_add_n.asm b/gcc/gmp/mpn/x86_64/atom/cnd_add_n.asm new file mode 100644 index 0000000..fcb9a0f 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/atom/cnd_add_n.asm @@ -1,0 +1,38 @@ +dnl X86-64 mpn_cnd_add_n. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_cnd_add_n) +include_mpn(`x86_64/coreisbr/cnd_add_n.asm') diff --git a/gcc/gmp/mpn/x86_64/atom/cnd_sub_n.asm b/gcc/gmp/mpn/x86_64/atom/cnd_sub_n.asm new file mode 100644 index 0000000..9eee1c1 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/atom/cnd_sub_n.asm @@ -1,0 +1,38 @@ +dnl X86-64 mpn_cnd_sub_n. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_cnd_sub_n) +include_mpn(`x86_64/coreisbr/cnd_sub_n.asm') diff --git a/gcc/gmp/mpn/x86_64/bd1/addmul_2.asm b/gcc/gmp/mpn/x86_64/bd1/addmul_2.asm new file mode 100644 index 0000000..b54e91a 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/bd1/addmul_2.asm @@ -1,0 +1,235 @@ +dnl AMD64 mpn_addmul_2 optimised for AMD Bulldozer. + +dnl Copyright 2017 Free Software Foundation, Inc. 
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bd1 4.2 +C AMD bd2 4.4 +C AMD bd3 +C AMD bd4 +C AMD zen +C AMD bt1 +C AMD bt2 +C Intel P4 +C Intel PNR +C Intel NHM +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel SKL +C Intel atom +C Intel SLM +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. 
+ +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`vp', `%rcx') C r9 + +define(`n', `%rcx') C shares rcx with vp; vp is read before n is set +define(`v0', `%rbx') +define(`v1', `%rbp') +define(`X0', `%r12') +define(`X1', `%r13') + +define(`w0', `%r8') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_addmul_2) C adds up[] times the two limbs at vp into rp[]; top limb comes back in rax + FUNC_ENTRY(4) + push %rbx C rbx, rbp, r12, r13 carry v0, v1, X0, X1 and are popped before ret + push %rbp + push %r12 + push %r13 + + mov (vp), v0 C v0 = vp[0] + mov 8(vp), v1 C v1 = vp[1] + + mov (up), %rax C rax = up[0] + mov $0, R32(w2) C abuse w2 + + lea (up,n_param,8), up C up now points just past its n_param limbs + lea (rp,n_param,8), rp C likewise rp + sub n_param, w2 C w2 = -n_param, a negative limb index + mul v0 C rdx:rax = up[0] * v0 + + test $1, R8(w2) C dispatch on n_param mod 4, low bit first + jnz L(bx1) + +L(bx0): mov %rdx, X0 + mov %rax, X1 + test $2, R8(w2) + jnz L(b10) + +L(b00): lea (w2), n C un = 4, 8, 12, ... + mov (up,w2,8), %rax + mov (rp,w2,8), w3 + mul v1 + mov %rax, w0 + mov 8(up,w2,8), %rax + mov %rdx, w1 + jmp L(lo0) + +L(b10): lea 2(w2), n C un = 2, 6, 10, ... + mov (up,w2,8), %rax + mov (rp,w2,8), w1 + mul v1 + mov %rdx, w3 + mov %rax, w2 + mov -8(up,n,8), %rax + test n, n C n is zero only when n_param = 2 + jz L(end) + jmp L(top) + +L(bx1): mov %rax, X0 + mov %rdx, X1 + test $2, R8(w2) + jz L(b11) + +L(b01): lea 1(w2), n C un = 1, 5, 9, ... + mov (up,w2,8), %rax + mul v1 + mov (rp,w2,8), w2 + mov %rdx, w0 + mov %rax, w3 + jmp L(lo1) + +L(b11): lea -1(w2), n C un = 3, 7, 11, ...
+ mov (up,w2,8), %rax + mul v1 + mov (rp,w2,8), w0 + mov %rax, w1 + mov 8(up,w2,8), %rax + mov %rdx, w2 + jmp L(lo3) + + ALIGN(32) +L(top): +L(lo2): mul v0 + add w1, X1 + mov X1, -16(rp,n,8) + mov %rdx, X1 + adc %rax, X0 + adc $0, X1 + mov -8(up,n,8), %rax + mul v1 + mov -8(rp,n,8), w1 + mov %rdx, w0 + add w1, w2 + adc %rax, w3 + adc $0, w0 +L(lo1): mov (up,n,8), %rax + mul v0 + add w2, X0 + mov X0, -8(rp,n,8) + mov %rdx, X0 + adc %rax, X1 + mov (up,n,8), %rax + adc $0, X0 + mov (rp,n,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + mov 8(up,n,8), %rax + mov %rdx, w1 + adc $0, w1 +L(lo0): mul v0 + add w3, X1 + mov X1, (rp,n,8) + adc %rax, X0 + mov 8(up,n,8), %rax + mov %rdx, X1 + adc $0, X1 + mov 8(rp,n,8), w3 + mul v1 + add w3, w0 + adc %rax, w1 + mov 16(up,n,8), %rax + mov %rdx, w2 + adc $0, w2 +L(lo3): mul v0 + add w0, X0 + mov X0, 8(rp,n,8) + mov %rdx, X0 + adc %rax, X1 + adc $0, X0 + mov 16(up,n,8), %rax + mov 16(rp,n,8), w0 + mul v1 + mov %rdx, w3 + add w0, w1 + adc %rax, w2 + adc $0, w3 + mov 24(up,n,8), %rax + add $4, n C four limbs are handled per pass + jnc L(top) C the add carries out exactly when n reaches zero + +L(end): mul v0 + add w1, X1 + mov X1, -16(rp) + mov %rdx, X1 + adc %rax, X0 + adc $0, X1 + mov -8(up), %rax + mul v1 + mov -8(rp), w1 + add w1, w2 + adc %rax, w3 + adc $0, %rdx + add w2, X0 + adc $0, X1 + mov X0, -8(rp) + add w3, X1 + mov X1, (rp) C limb n_param of rp is only written, never read + adc $0, %rdx + mov %rdx, %rax C top limb goes back in rax + + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gcc/gmp/mpn/x86_64/bd1/aorrlsh_n.asm b/gcc/gmp/mpn/x86_64/bd1/aorrlsh_n.asm new file mode 100644 index 0000000..5516c9d 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/bd1/aorrlsh_n.asm @@ -1,0 +1,38 @@ +dnl X86-64 mpn_addlsh_n and mpn_rsblsh_n. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n) +include_mpn(`x86_64/aorrlsh_n.asm') diff --git a/gcc/gmp/mpn/x86_64/bd1/aors_n.asm b/gcc/gmp/mpn/x86_64/bd1/aors_n.asm new file mode 100644 index 0000000..143c42e 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/bd1/aors_n.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_add_n, mpn_sub_n for AMD Bulldozer (bd1), reusing the coreihwl code. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version.
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) +include_mpn(`x86_64/coreihwl/aors_n.asm') diff --git a/gcc/gmp/mpn/x86_64/bd1/gcd_11.asm b/gcc/gmp/mpn/x86_64/bd1/gcd_11.asm new file mode 100644 index 0000000..4723093 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/bd1/gcd_11.asm @@ -1,0 +1,37 @@ +dnl AMD64 mpn_gcd_11. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_11) +include_mpn(`x86_64/core2/gcd_11.asm') diff --git a/gcc/gmp/mpn/x86_64/bd2/gcd_11.asm b/gcc/gmp/mpn/x86_64/bd2/gcd_11.asm new file mode 100644 index 0000000..b167077 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/bd2/gcd_11.asm @@ -1,0 +1,96 @@ +dnl AMD64 mpn_gcd_11 optimised for AMD BD2, BD3, BT2. + +dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn +dnl Granlund. + +dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017, 2019 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit (approx) +C AMD K8,K9 ? +C AMD K10 ? +C AMD bd1 5.4 +C AMD bd2 3.72 +C AMD bd3 ? +C AMD bd4 4.12 +C AMD bt1 9.0 +C AMD bt2 3.97 +C AMD zn1 3.36 +C AMD zn2 3.33 +C Intel P4 ? +C Intel CNR ? +C Intel PNR ? +C Intel NHM ? +C Intel WSM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? 
+C Intel atom ? +C Intel SLM ? +C Intel GLM ? +C Intel GLM+ ? +C VIA nano ? + +define(`u0', `%rdi') +define(`v0', `%rsi') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_11) C single-limb gcd of u0 and v0 by repeated subtract and shift + FUNC_ENTRY(2) + mov v0, %rdx + sub u0, %rdx C rdx = v - u + jz L(end) C u == v, nothing to reduce + + ALIGN(16) +L(top): rep;bsf %rdx, %rcx C tzcnt! + mov u0, %rax C keep the old u for the min(u,v) select below + sub v0, u0 C u - v + cmovc %rdx, u0 C u = |u - v| + cmovc %rax, v0 C v = min(u,v) + shr R8(%rcx), u0 C drop the trailing zeros counted from v - u + mov v0, %rdx + sub u0, %rdx C v - u + jnz L(top) C repeat until u == v + +L(end): mov v0, %rax + C rax = result + C rdx = 0 for the benefit of internal gcd_22 call + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gcc/gmp/mpn/x86_64/bd2/gcd_22.asm b/gcc/gmp/mpn/x86_64/bd2/gcd_22.asm new file mode 100644 index 0000000..a4f30ea 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/bd2/gcd_22.asm @@ -1,0 +1,142 @@ +dnl AMD64 mpn_gcd_22. Assumes useless bsf, useless shrd, tzcnt, no shlx. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/.
+ +include(`../config.m4') + + +C cycles/bit +C AMD K8,K9 12.3 +C AMD K10 8.0 +C AMD bd1 10.0 +C AMD bd2 7.2 +C AMD bd3 ? +C AMD bd4 6.7 +C AMD bt1 13.6 +C AMD bt2 8.9 +C AMD zn1 5.7 +C AMD zn2 5.6 +C Intel P4 ? +C Intel CNR 9.7 +C Intel PNR 9.7 +C Intel NHM 9.4 +C Intel WSM 9.5 +C Intel SBR 10.3 +C Intel IBR ? +C Intel HWL 8.2 +C Intel BWL 7.4 +C Intel SKL 7.3 +C Intel atom 26.5 +C Intel SLM 17.4 +C Intel GLM 13.4 +C Intel GLM+ 12.4 +C VIA nano ? + + +define(`u1', `%rdi') +define(`u0', `%rsi') +define(`v1', `%rdx') +define(`v0_param', `%rcx') + +define(`v0', `%rax') +define(`cnt', `%rcx') + +define(`s0', `%r8') +define(`s1', `%r9') +define(`t0', `%r10') +define(`t1', `%r11') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_22) C two-limb gcd of (u1,,u0) and (v1,,v0); tail-calls mpn_gcd_11 once both high limbs are zero + FUNC_ENTRY(4) + mov v0_param, v0 C v0 moves to rax so that rcx is free for cnt + + ALIGN(16) +L(top): mov v0, t0 + sub u0, t0 C t0 = v0 - u0 + jz L(lowz) C jump when low limb result = 0 + mov v1, t1 + sbb u1, t1 C (t1,,t0) = v - u + + rep;bsf t0, cnt C tzcnt! + mov u0, s0 C (s1,,s0) = old u, for the min(u,v) select + mov u1, s1 + + sub v0, u0 + sbb v1, u1 C (u1,,u0) = u - v, carry set when u < v + +L(bck): cmovc t0, u0 C u = |u - v| + cmovc t1, u1 C u = |u - v| + cmovc s0, v0 C v = min(u,v) + cmovc s1, v1 C v = min(u,v) + +C Rightshift (u1,,u0) into (u1,,u0) +L(shr): shr R8(cnt), u0 + mov u1, t1 + shr R8(cnt), u1 + neg cnt C the count is used mod 64, so the shl below shifts by 64 - cnt + shl R8(cnt), t1 C bits of u1 that cross into the low limb + or t1, u0 + + test v1, v1 C keep looping while either high limb is nonzero + jnz L(top) + test u1, u1 + jnz L(top) + +L(gcd_11): + mov v0, %rdi C first argument for mpn_gcd_11; u0 already sits in rsi +C mov u0, %rsi + TCALL( mpn_gcd_11) + +L(lowz):C We come here when v0 - u0 = 0 + C 1. If v1 - u1 = 0, then gcd is u = v. + C 2. Else compute gcd_21({v1,v0}, |u1-v1|) + mov v1, t0 + sub u1, t0 C t0 = v1 - u1 + je L(end) + + xor t1, t1 C (t1,,t0) = (0,,v1 - u1) + rep;bsf t0, cnt C tzcnt!
+ mov u0, s0 + mov u1, s1 + mov u1, u0 + xor u1, u1 + sub v1, u0 C (u1,,u0) = (0,,u1 - v1), carry set when u1 < v1 + jmp L(bck) + +L(end): C mov v0, %rax + C mov v1, %rdx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gcc/gmp/mpn/x86_64/bd4/aorrlsh_n.asm b/gcc/gmp/mpn/x86_64/bd4/aorrlsh_n.asm new file mode 100644 index 0000000..ff0d27b 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/bd4/aorrlsh_n.asm @@ -1,0 +1,38 @@ +dnl X86-64 mpn_addlsh_n and mpn_rsblsh_n. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n) +include_mpn(`x86_64/zen/aorrlsh_n.asm') diff --git a/gcc/gmp/mpn/x86_64/bd4/gcd_11.asm b/gcc/gmp/mpn/x86_64/bd4/gcd_11.asm new file mode 100644 index 0000000..4176b85 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/bd4/gcd_11.asm @@ -1,0 +1,96 @@ +dnl AMD64 mpn_gcd_11 optimised for AMD BD4, ZN1. + +dnl Based on the K7 gcd_1.asm, by Kevin Ryde.
Rehacked for AMD64 by Torbjorn +dnl Granlund. + +dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017, 2019 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit (approx) +C AMD K8,K9 - +C AMD K10 - +C AMD bd1 - +C AMD bd2 - +C AMD bd3 - +C AMD bd4 3.73 +C AMD bt1 - +C AMD bt2 - +C AMD zn1 3.33 +C AMD zn2 3.48 +C Intel P4 - +C Intel CNR - +C Intel PNR - +C Intel NHM - +C Intel WSM - +C Intel SBR - +C Intel IBR - +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom - +C Intel SLM - +C Intel GLM - +C Intel GLM+ - +C VIA nano - + +define(`u0', `%rdi') +define(`v0', `%rsi') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_11) C single-limb gcd of u0 and v0; rax mirrors u so it doubles as the result + FUNC_ENTRY(2) + mov u0, %rax C rax = u, also the result if u == v already + mov v0, %rdx + sub u0, %rdx C v - u + jz L(end) + + ALIGN(16) +L(top): rep;bsf %rdx, %rcx C tzcnt!
+ sub v0, u0 C u - v + cmovc %rdx, u0 C u = |u - v| + cmovc %rax, v0 C v = min(u,v) + shrx( %rcx, u0, %rax) C rax = u with the counted trailing zeros shifted out + shrx( %rcx, u0, u0) C same value kept in u0 for the next pass + mov v0, %rdx + sub %rax, %rdx C v - u + jnz L(top) C repeat until u == v + +L(end): C rax = result + C rdx = 0 for the benefit of internal gcd_22 call + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gcc/gmp/mpn/x86_64/bd4/gcd_22.asm b/gcc/gmp/mpn/x86_64/bd4/gcd_22.asm new file mode 100644 index 0000000..5dfd9e3 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/bd4/gcd_22.asm @@ -1,0 +1,37 @@ +dnl AMD64 mpn_gcd_22. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/.
+ +include(`../config.m4') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_22) +include_mpn(`x86_64/coreihwl/gcd_22.asm') diff --git a/gcc/gmp/mpn/x86_64/bd4/gmp-mparam.h b/gcc/gmp/mpn/x86_64/bd4/gmp-mparam.h new file mode 100644 index 0000000..9d2038c 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/bd4/gmp-mparam.h @@ -1,0 +1,266 @@ +/* AMD bd4 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 3800-4200 MHz Excavator/Bristol Ridge */ +/* FFT tuning limit = 461,179,335 */ +/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 17 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 52 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 13 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 25 + +#define DIV_1_VS_MUL_1_PERCENT 298 + +#define MUL_TOOM22_THRESHOLD 16 +#define MUL_TOOM33_THRESHOLD 53 +#define MUL_TOOM44_THRESHOLD 142 +#define MUL_TOOM6H_THRESHOLD 206 +#define MUL_TOOM8H_THRESHOLD 292 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 83 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 102 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 97 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 98 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 82 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 20 +#define SQR_TOOM3_THRESHOLD 71 +#define SQR_TOOM4_THRESHOLD 202 +#define SQR_TOOM6_THRESHOLD 298 +#define SQR_TOOM8_THRESHOLD 466 + +#define MULMID_TOOM42_THRESHOLD 20 + +#define MULMOD_BNM1_THRESHOLD 11 +#define SQRMOD_BNM1_THRESHOLD 14 + +#define MUL_FFT_MODF_THRESHOLD 316 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 316, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \ + { 25, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 33, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 83,10}, { 
47, 9}, \ + { 99,10}, { 55,11}, { 31,10}, { 87,11}, \ + { 47,10}, { 95, 9}, { 191,10}, { 103,12}, \ + { 31,11}, { 63,10}, { 127, 9}, { 255,10}, \ + { 135, 9}, { 271, 5}, { 4351, 6}, { 2303, 7}, \ + { 1215, 8}, { 639,10}, { 175,11}, { 95,10}, \ + { 191, 9}, { 383,10}, { 207, 9}, { 415,11}, \ + { 111,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,11}, { 143,10}, \ + { 287, 9}, { 575,10}, { 303,11}, { 159,10}, \ + { 319, 9}, { 639,11}, { 175,12}, { 95,11}, \ + { 191,10}, { 383,11}, { 207,10}, { 415, 9}, \ + { 831,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 351,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,12}, { 223,11}, \ + { 447,10}, { 895,11}, { 479,13}, { 127,12}, \ + { 255,11}, { 543,12}, { 287,11}, { 607,12}, \ + { 319,11}, { 639,12}, { 351,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ + { 447,11}, { 895,12}, { 479,14}, { 127,13}, \ + { 255,12}, { 543,11}, { 1087,12}, { 607,13}, \ + { 319,12}, { 671,11}, { 1343,10}, { 2687,12}, \ + { 703,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 831,13}, { 447,12}, { 895,11}, { 1791,12}, \ + { 959,14}, { 255,13}, { 511,12}, { 1087,13}, \ + { 575,12}, { 1151,11}, { 2303,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1343,11}, { 2687,13}, \ + { 703,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1663,13}, { 959,15}, { 255,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1151,12}, \ + { 2303,13}, { 1215,12}, { 2431,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \ + { 1471,14}, { 767,13}, { 1535,12}, { 3071,13}, \ + { 1663,14}, { 895,13}, { 1791,12}, { 3583,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2303,12}, { 4607,13}, { 2431,12}, \ + { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \ + { 2815,15}, { 767,14}, { 1535,13}, { 3071,14}, \ + { 1663,13}, { 3455,12}, { 6911,14}, { 1791,13}, \ + { 3583,14}, { 1919,16}, { 
511,15}, { 1023,14}, \ + { 2303,13}, { 4607,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2943,13}, { 5887,15}, { 1535,14}, \ + { 3455,13}, { 6911,15}, { 1791,14}, { 3839,13}, \ + { 7679,16}, { 1023,15}, { 2047,14}, { 4351,15}, \ + { 2303,14}, { 4863,15}, { 2815,14}, { 5887,16}, \ + { 1535,15}, { 3071,14}, { 6143,15}, { 3327,14}, \ + { 6911,15}, { 3839,14}, { 7679,17}, { 1023,16}, \ + { 2047,15}, { 4863,16}, { 2559,15}, { 5887,14}, \ + { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \ + { 7679,17}, { 2047,16}, { 4095,15}, { 8191,16}, \ + { 4607,15}, { 9983,16}, { 5631,15}, { 11775,17}, \ + { 3071,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 253 +#define MUL_FFT_THRESHOLD 4224 + +#define SQR_FFT_MODF_THRESHOLD 300 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 300, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 23, 7}, { 12, 6}, { 25, 7}, { 21, 8}, \ + { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 63,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95, 9}, \ + { 191, 8}, { 383,10}, { 103,12}, { 31,11}, \ + { 63,10}, { 127, 9}, { 255, 8}, { 511, 9}, \ + { 271, 8}, { 543,11}, { 79,10}, { 159, 9}, \ + { 319, 8}, { 639,10}, { 175,11}, { 95,10}, \ + { 191, 9}, { 383, 5}, { 6399, 6}, { 3327, 7}, \ + { 1727, 6}, { 3455, 7}, { 1791,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \ + { 287, 9}, { 575,10}, { 303,11}, { 159,10}, \ + { 319, 9}, { 639,11}, { 175,10}, { 351,12}, \ + { 95,11}, { 191,10}, { 383,11}, { 207,10}, \ + { 415, 9}, { 831,13}, { 63,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,10}, { 607,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 351,10}, { 703,12}, \ + { 191,11}, 
{ 383,10}, { 767,11}, { 415,10}, \ + { 831,12}, { 223,11}, { 447,10}, { 895,11}, \ + { 479,12}, { 255,11}, { 511,10}, { 1023,11}, \ + { 543,12}, { 287,11}, { 575,10}, { 1151,11}, \ + { 607,12}, { 319,11}, { 639,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 447,11}, { 895,12}, \ + { 479,13}, { 255,12}, { 511,11}, { 1023,12}, \ + { 543,11}, { 1087,12}, { 575,11}, { 1151,12}, \ + { 607,13}, { 319,12}, { 639,11}, { 1279,12}, \ + { 671,11}, { 1343,12}, { 703,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 831,11}, { 1663,13}, \ + { 447,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1087,13}, { 575,12}, { 1151,11}, { 2303,12}, \ + { 1215,11}, { 2431,13}, { 639,12}, { 1343,13}, \ + { 703,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1663,13}, { 895,12}, { 1791,13}, \ + { 959,15}, { 255,14}, { 511,13}, { 1023,12}, \ + { 2047,13}, { 1087,12}, { 2175,13}, { 1151,12}, \ + { 2303,13}, { 1215,12}, { 2431,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \ + { 1471,14}, { 767,13}, { 1599,12}, { 3199,13}, \ + { 1663,14}, { 895,13}, { 1791,12}, { 3583,15}, \ + { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2303,12}, { 4607,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,13}, { 2815,15}, \ + { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \ + { 3455,14}, { 1791,13}, { 3583,14}, { 1919,16}, \ + { 511,15}, { 1023,14}, { 2303,13}, { 4607,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2815,13}, \ + { 5631,14}, { 2943,13}, { 5887,15}, { 1535,14}, \ + { 3455,15}, { 1791,14}, { 3583,13}, { 7167,14}, \ + { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \ + { 4223,15}, { 2303,14}, { 4863,15}, { 2815,14}, \ + { 5887,16}, { 1535,15}, { 3071,14}, { 6143,15}, \ + { 3327,14}, { 6911,15}, { 3583,14}, { 7167,15}, \ + { 3839,14}, { 7679,17}, { 1023,16}, { 2047,15}, \ + { 4095,14}, { 8191,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \ + { 3583,15}, { 7679,14}, { 15359,17}, { 
2047,16}, \ + { 4095,15}, { 8447,16}, { 4607,15}, { 9983,16}, \ + { 5119,15}, { 10239,16}, { 5631,15}, { 11775,17}, \ + { 3071,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 273 +#define SQR_FFT_THRESHOLD 2752 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 43 +#define MULLO_MUL_N_THRESHOLD 8397 +#define SQRLO_BASECASE_THRESHOLD 6 +#define SQRLO_DC_THRESHOLD 54 +#define SQRLO_SQR_THRESHOLD 5397 + +#define DC_DIV_QR_THRESHOLD 39 +#define DC_DIVAPPR_Q_THRESHOLD 165 +#define DC_BDIV_QR_THRESHOLD 39 +#define DC_BDIV_Q_THRESHOLD 76 + +#define INV_MULMOD_BNM1_THRESHOLD 30 +#define INV_NEWTON_THRESHOLD 177 +#define INV_APPR_THRESHOLD 155 + +#define BINV_NEWTON_THRESHOLD 230 +#define REDC_1_TO_REDC_2_THRESHOLD 28 +#define REDC_2_TO_REDC_N_THRESHOLD 43 + +#define MU_DIV_QR_THRESHOLD 1142 +#define MU_DIVAPPR_Q_THRESHOLD 1142 +#define MUPI_DIV_QR_THRESHOLD 66 +#define MU_BDIV_QR_THRESHOLD 998 +#define MU_BDIV_Q_THRESHOLD 1142 + +#define POWM_SEC_TABLE 1,16,175,269,839,1420 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 19 +#define SET_STR_DC_THRESHOLD 552 +#define SET_STR_PRECOMPUTE_THRESHOLD 1038 + +#define FAC_DSC_THRESHOLD 151 +#define FAC_ODD_THRESHOLD 23 + +#define MATRIX22_STRASSEN_THRESHOLD 17 +#define HGCD2_DIV1_METHOD 1 /* 8.11% faster than 3 */ +#define HGCD_THRESHOLD 87 +#define HGCD_APPR_THRESHOLD 96 +#define HGCD_REDUCE_THRESHOLD 2121 +#define GCD_DC_THRESHOLD 327 +#define GCDEXT_DC_THRESHOLD 241 +#define JACOBI_BASE_METHOD 4 /* 21.40% faster than 1 */ + +/* Tuneup completed successfully, took 431056 seconds */ diff --git a/gcc/gmp/mpn/x86_64/bt1/aors_n.asm b/gcc/gmp/mpn/x86_64/bt1/aors_n.asm new file mode 100644 index 0000000..9b6b5c7 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/bt1/aors_n.asm @@ -1,0 +1,159 @@ +dnl AMD64 mpn_add_n, mpn_sub_n optimised for bobcat. 
+ +dnl Copyright 2003-2005, 2007, 2008, 2010-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 1.77 +C AMD K10 1.76\1.82 +C AMD bd1 1.67\2.12 +C AMD bd2 1.62\1.82 +C AMD bd3 +C AMD bd4 1.55\2.2 +C AMD zen +C AMD bt1 2.54 +C AMD bt2 2 +C Intel P4 11 +C Intel PNR 4.76 +C Intel NHM 5.27 +C Intel SBR 2 +C Intel IBR 1.94 +C Intel HWL 1.63 +C Intel BWL 1.51 +C Intel SKL 1.51 +C Intel atom 3.56 +C Intel SLM 4 +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimization tool suite written by David Harvey and Torbjorn Granlund. 
+
+C INPUT PARAMETERS
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`vp', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`cy', `%r8') C rsp+40 (mpn_add_nc and mpn_sub_nc)
+
+ifdef(`OPERATION_add_n', `
+ define(ADCSBB, adc)
+ define(func, mpn_add_n)
+ define(func_nc, mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+ define(ADCSBB, sbb)
+ define(func, mpn_sub_n)
+ define(func_nc, mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+        TEXT
+        ALIGN(16)
+PROLOGUE(func)
+        FUNC_ENTRY(4)
+        xor     %r8, %r8                C plain entry point: carry-in is zero
+L(ent): test    $1, R8(n)               C dispatch on n mod 4; func_nc joins here with cy in r8
+        jnz     L(bx1)
+
+L(bx0): test    $2, R8(n)
+        jnz     L(b10)
+
+L(b00): shr     $2, n                   C n = number of four-limb passes
+        neg     %r8                     C CF = 1 iff the carry-in is nonzero
+        mov     $3, R32(%rax)           C rax = starting limb index; mov leaves CF alone
+        mov     (up), %r10
+        mov     8(up), %r11
+        jmp     L(lo0)
+
+L(b10): shr     $2, n
+        neg     %r8                     C carry-in moves into CF
+        mov     $1, R32(%rax)
+        mov     (up), %r8               C r8 is reused for data now that the carry is in CF
+        mov     8(up), %r9
+        jrcxz   L(cj2)                  C n is rcx: no full pass left, finish in the two-limb tail
+        jmp     L(top)
+
+L(bx1): test    $2, R8(n)
+        jnz     L(b11)
+
+L(b01): shr     $2, n
+        neg     %r8                     C carry-in moves into CF
+        mov     $0, R32(%rax)
+        mov     (up), %r9
+        jrcxz   L(cj1)                  C no full pass left, finish in the one-limb tail
+        mov     8(up), %r10
+        jmp     L(lo1)
+
+        ALIGN(8)
+L(b11): inc     n                       C reached when both low bits of n are set; round the pass count up
+        shr     $2, n
+        neg     %r8                     C carry-in moves into CF
+        mov     $2, R32(%rax)
+        mov     (up), %r11
+        jmp     L(lo3)
+
+        ALIGN(4)
+L(top): mov     8(up,%rax,8), %r10      C loop body: loads, stores, lea and dec all leave CF alone
+        ADCSBB  -8(vp,%rax,8), %r8
+        mov     %r8, -8(rp,%rax,8)
+L(lo1): mov     16(up,%rax,8), %r11
+        ADCSBB  (vp,%rax,8), %r9
+        lea     4(%rax), %rax           C advance the index by four limbs without touching flags
+        mov     %r9, -32(rp,%rax,8)
+L(lo0): ADCSBB  -24(vp,%rax,8), %r10
+        mov     %r10, -24(rp,%rax,8)
+L(lo3): ADCSBB  -16(vp,%rax,8), %r11
+        dec     n                       C sets ZF for the jnz below; CF is not modified by dec
+        mov     -8(up,%rax,8), %r8
+        mov     %r11, -16(rp,%rax,8)
+L(lo2): mov     (up,%rax,8), %r9
+        jnz     L(top)
+
+L(cj2): ADCSBB  -8(vp,%rax,8), %r8      C tail: last two limbs
+        mov     %r8, -8(rp,%rax,8)
+L(cj1): ADCSBB  (vp,%rax,8), %r9        C tail: last limb
+        mov     %r9, (rp,%rax,8)
+
+        mov     $0, R32(%rax)           C mov rather than xor so the final CF survives
+        adc     $0, R32(%rax)           C return value = carry or borrow out
+
+        FUNC_EXIT()
+        ret
+EPILOGUE()
+
+        ALIGN(16)
+PROLOGUE(func_nc)
+        FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+        jmp     L(ent)                  C share the body of func, skipping its xor of r8
+EPILOGUE()
diff --git a/gcc/gmp/mpn/x86_64/bt1/aorsmul_1.asm b/gcc/gmp/mpn/x86_64/bt1/aorsmul_1.asm
new file mode 100644
index 0000000..41e1d8a 100644
--- /dev/null
+++ 
b/gcc/gmp/mpn/x86_64/bt1/aorsmul_1.asm @@ -1,0 +1,191 @@ +dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD bt1/bt2. + +dnl Copyright 2003-2005, 2007, 2008, 2011, 2012, 2018-2019 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4.52 old measurement +C AMD K10 4.51 old measurement +C AMD bd1 4.66 old measurement +C AMD bd2 4.57 old measurement +C AMD bd3 ? +C AMD bd4 ? +C AMD zen ? +C AMD bt1 5.04 +C AMD bt2 5.07 +C Intel P4 16.8 18.6 old measurement +C Intel PNR 5.59 old measurement +C Intel NHM 5.39 old measurement +C Intel SBR 3.93 old measurement +C Intel IBR 3.59 old measurement +C Intel HWL 3.61 old measurement +C Intel BWL 2.76 old measurement +C Intel SKL 2.77 old measurement +C Intel atom 23 old measurement +C Intel SLM 8 old measurement +C Intel GLM ? +C VIA nano 5.63 old measurement + +C The ALIGNment here might look completely ad-hoc. They are not. 
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ifdef(`OPERATION_addmul_1',`
+ define(`ADDSUB', `add')
+ define(`func', `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+ define(`ADDSUB', `sub')
+ define(`func', `mpn_submul_1')
+')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+C Standard parameters
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n_param', `%rdx')
+define(`v0', `%rcx')
+C Standard allocations
+define(`n', `%rbx')
+define(`w0', `%r8')
+define(`w1', `%r9')
+define(`w2', `%r10')
+define(`w3', `%r11')
+
+C DOS64 parameters
+IFDOS(` define(`rp', `%rcx') ') dnl
+IFDOS(` define(`up', `%rsi') ') dnl
+IFDOS(` define(`n_param', `%r8') ') dnl
+IFDOS(` define(`v0', `%r9') ') dnl
+C DOS64 allocations
+IFDOS(` define(`n', `%rbx') ') dnl
+IFDOS(` define(`w0', `%r8') ') dnl
+IFDOS(` define(`w1', `%rdi') ') dnl
+IFDOS(` define(`w2', `%r10') ') dnl
+IFDOS(` define(`w3', `%r11') ') dnl
+
+ASM_START()
+        TEXT
+        ALIGN(64)
+PROLOGUE(func)
+IFDOS(` push %rsi ')
+IFDOS(` push %rdi ')
+IFDOS(` mov %rdx, %rsi ')
+
+        push    %rbx                    C n is kept in rbx, restored by the pops before each ret
+        mov     (up), %rax              C first source limb, fetched before up is advanced
+
+        lea     (rp,n_param,8), rp      C point rp and up past their last limb so that
+        lea     (up,n_param,8), up      C a negative index in n can count up toward zero
+        mov     n_param, n              C mul overwrites rdx, so the count moves to n first
+
+        test    $1, R8(n_param)         C dispatch on the two low bits of the limb count
+        jne     L(bx1)
+
+L(bx0): mul     v0                      C rdx:rax = first limb times v0
+        neg     n
+        mov     %rax, w0
+        mov     %rdx, w1
+        test    $2, R8(n)
+        jne     L(L2)
+
+L(b00): add     $2, n
+        jmp     L(L0)
+
+        ALIGN(16)
+L(bx1): mul     v0                      C rdx:rax = first limb times v0
+        test    $2, R8(n)
+        je      L(b01)
+
+L(b11): mov     %rax, w2
+        mov     %rdx, w3
+        neg     n
+        inc     n
+        jmp     L(L3)
+
+        ALIGN(16)
+L(b01): sub     $3, n                   C low two bits of n are 01 here
+        jc      L(n1)                   C borrow means n was below 3: single-limb case
+        mov     %rax, w2
+        mov     %rdx, w3
+        neg     n
+
+        ALIGN(16)
+L(top): mov     -16(up,n,8), %rax       C four-way unrolled loop; product pairs alternate
+        mul     v0                      C between w1:w0 and w3:w2
+        mov     %rax, w0
+        mov     %rdx, w1
+        ADDSUB  w2, -24(rp,n,8)         C fold previous low limb into rp; CF = carry or borrow
+        adc     w3, w0                  C previous high limb plus that CF go into the new low limb
+        adc     $0, w1
+L(L0):  mov     -8(up,n,8), %rax
+        mul     v0
+        mov     %rax, w2
+        mov     %rdx, w3
+        ADDSUB  w0, -16(rp,n,8)
+        adc     w1, w2
+        adc     $0, w3
+L(L3):  mov     (up,n,8), %rax
+        mul     v0
+        mov     %rax, w0
+        mov     %rdx, w1
+        ADDSUB  w2, -8(rp,n,8)
+        adc     w3, w0
+        adc     $0, w1
+L(L2):  mov     8(up,n,8), %rax
+        mul     v0
+        mov     %rax, w2
+        mov     %rdx, w3
+        ADDSUB  w0, (rp,n,8)
+        adc     w1, w2
+        adc     $0, w3
+        add     $4, n                   C next group of four limbs
+        js      L(top)                  C keep looping while the index is still negative
+
+L(end): xor     R32(%rax), R32(%rax)    C rax = 0 before the last ADDSUB sets CF
+        ADDSUB  w2, -8(rp)              C last limb of rp
+        adc     w3, %rax                C return value = final high limb plus CF
+        pop     %rbx
+IFDOS(` pop %rdi ')
+IFDOS(` pop %rsi ')
+        ret
+
+        ALIGN(32)
+L(n1):  ADDSUB  %rax, -8(rp)            C single-limb case: rdx:rax still holds the product
+        mov     $0, R32(%rax)           C mov rather than xor so CF from ADDSUB survives
+        adc     %rdx, %rax              C return value = high limb plus CF
+        pop     %rbx
+IFDOS(` pop %rdi ')
+IFDOS(` pop %rsi ')
+        ret
+EPILOGUE()
diff --git a/gcc/gmp/mpn/x86_64/bt1/copyd.asm b/gcc/gmp/mpn/x86_64/bt1/copyd.asm
new file mode 100644
index 0000000..877714e 100644
--- /dev/null
+++ b/gcc/gmp/mpn/x86_64/bt1/copyd.asm
@@ -1,0 +1,91 @@
+dnl AMD64 mpn_copyd optimised for AMD bobcat.
+
+dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 1
+C AMD K10 1-2 (alignment fluctuations)
+C AMD bd1 ?
+C AMD bobcat 1.5
+C Intel P4 2.8
+C Intel core2 1
+C Intel NHM 1-1.25
+C Intel SBR 1
+C Intel atom 2.87
+C VIA nano 2
+
+C INPUT PARAMETERS
+C rp rdi
+C up rsi
+C n rdx
+
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`n',`%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+        TEXT
+        ALIGN(16)
+PROLOGUE(mpn_copyd)
+        FUNC_ENTRY(3)
+        sub     $4, n                   C n = count - 4; negative means fewer than four limbs
+        jl      L(end)
+        ALIGN(16)
+L(top): mov     24(up,n,8), %r8         C copy limbs n+3 down to n, highest address first
+        mov     %r8, 24(rp,n,8)
+        mov     16(up,n,8), %r8
+        mov     %r8, 16(rp,n,8)
+        mov     8(up,n,8), %r8
+        mov     %r8, 8(rp,n,8)
+        mov     (up,n,8), %r8
+        mov     %r8, (rp,n,8)
+L(ent): sub     $4, n                   C step down to the next group of four
+        jge     L(top)
+
+L(end): cmp     $-4, R32(n)             C here n is -4 to -1, so n+4 limbs remain
+        jz      L(ret)                  C -4: nothing left
+        mov     24(up,n,8), %r8         C highest remaining limb, at index n+3
+        mov     %r8, 24(rp,n,8)
+        cmp     $-3, R32(n)
+        jz      L(ret)                  C -3: that was the only one
+        mov     16(up,n,8), %r8
+        mov     %r8, 16(rp,n,8)
+        cmp     $-2, R32(n)
+        jz      L(ret)                  C -2: two limbs done
+        mov     8(up,n,8), %r8          C -1: third and last limb, index 0
+        mov     %r8, 8(rp,n,8)
+
+L(ret): FUNC_EXIT()
+        ret
+EPILOGUE()
diff --git a/gcc/gmp/mpn/x86_64/bt1/copyi.asm b/gcc/gmp/mpn/x86_64/bt1/copyi.asm
new file mode 100644
index 0000000..ee0f578 100644
--- /dev/null
+++ b/gcc/gmp/mpn/x86_64/bt1/copyi.asm
@@ -1,0 +1,94 @@
+dnl AMD64 mpn_copyi optimised for AMD bobcat.
+
+dnl Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 1
+C AMD K10 1-2 (alignment fluctuations)
+C AMD bd1 ?
+C AMD bobcat 1.5
+C Intel P4 2.8
+C Intel core2 1
+C Intel NHM 1-1.25
+C Intel SBR 1
+C Intel atom 2.87
+C VIA nano 2
+
+C INPUT PARAMETERS
+C rp rdi
+C up rsi
+C n rdx
+
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`n',`%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+        TEXT
+        ALIGN(16)
+PROLOGUE(mpn_copyi)
+        FUNC_ENTRY(3)
+        lea     -32(up,n,8), up         C up and rp now point four limbs before their ends
+        lea     -32(rp,n,8), rp
+        neg     n
+        add     $4, n                   C n = 4 - count, so index n addresses limb 0
+        jg      L(end)                  C positive means fewer than four limbs
+        ALIGN(16)
+L(top): mov     (up,n,8), %r8           C copy four limbs, lowest address first
+        mov     %r8, (rp,n,8)
+        mov     8(up,n,8), %r8
+        mov     %r8, 8(rp,n,8)
+        mov     16(up,n,8), %r8
+        mov     %r8, 16(rp,n,8)
+        mov     24(up,n,8), %r8
+        mov     %r8, 24(rp,n,8)
+L(ent): add     $4, n                   C step up to the next group of four
+        jle     L(top)
+
+L(end): cmp     $4, R32(n)              C here n is 1 to 4, so 4-n limbs remain
+        jz      L(ret)                  C 4: nothing left
+        mov     (up,n,8), %r8
+        mov     %r8, (rp,n,8)
+        cmp     $3, R32(n)
+        jz      L(ret)                  C 3: that was the only one
+        mov     8(up,n,8), %r8
+        mov     %r8, 8(rp,n,8)
+        cmp     $2, R32(n)
+        jz      L(ret)                  C 2: two limbs done
+        mov     16(up,n,8), %r8         C 1: third and last limb
+        mov     %r8, 16(rp,n,8)
+
+L(ret): FUNC_EXIT()
+        ret
+EPILOGUE()
diff --git a/gcc/gmp/mpn/x86_64/bt1/gcd_11.asm b/gcc/gmp/mpn/x86_64/bt1/gcd_11.asm
new file mode 100644
index 0000000..ef53392 100644
--- /dev/null
+++ b/gcc/gmp/mpn/x86_64/bt1/gcd_11.asm
@@ -1,0 +1,119 @@
+dnl AMD64 mpn_gcd_11 -- 1 x 1 gcd.
+
+dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn
+dnl Granlund.
+
+dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit +C AMD K8,K9 ? +C AMD K10 ? +C AMD bd1 ? +C AMD bd2 ? +C AMD bd3 ? +C AMD bd4 ? +C AMD bt1 5.4 +C AMD bt2 ? +C AMD zn1 ? +C AMD zn2 ? +C Intel P4 ? +C Intel CNR ? +C Intel PNR ? +C Intel NHM ? +C Intel WSM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom ? +C Intel SLM ? +C Intel GLM ? +C Intel GLM+ ? +C VIA nano ? + + +C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0. 
+
+deflit(MAXSHIFT, 8)
+deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
+
+DEF_OBJECT(ctz_table,64)
+ .byte MAXSHIFT
+forloop(i,1,MASK,
+` .byte m4_count_trailing_zeros(i)
+')
+END_OBJECT(ctz_table)
+
+define(`u0', `%rdi')
+define(`v0', `%rsi')
+
+define(`cnt', `%rcx')
+define(`s0', `%rax')
+define(`t0', `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+        TEXT
+        ALIGN(64)
+PROLOGUE(mpn_gcd_11)
+        FUNC_ENTRY(2)
+        LEA(    ctz_table, %r10)        C r10 = base address of ctz_table
+        mov     v0, t0
+        sub     u0, t0                  C t0 = v - u
+        jz      L(end)                  C operands already equal: v is the result
+
+        ALIGN(16)
+L(top): mov     u0, s0                  C keep the old u for the cmovc below
+        sub     v0, u0                  C u - v; CF set when u < v
+        cmovc   t0, u0                  C u = |u - v|
+        cmovc   s0, v0                  C v = min(u,v)
+        and     $MASK, R32(t0)          C low MAXSHIFT bits of v - u; sets ZF
+        movzbl  (%r10,t0), R32(cnt)     C table lookup; movzbl leaves ZF from the and intact
+        jz      L(count_better)         C low bits all zero: table cannot tell, use bsf
+L(shr): shr     R8(cnt), u0             C drop the trailing zero bits of u
+        mov     v0, t0
+        sub     u0, t0                  C t0 = v - u for the next round
+        jnz     L(top)                  C loop until u equals v
+
+L(end): mov     v0, %rax
+        C rdx = 0 for the benefit of internal gcd_22 call
+        FUNC_EXIT()
+        ret
+
+L(count_better):
+        bsf     u0, cnt                 C full-width trailing zero count of u
+        jmp     L(shr)
+EPILOGUE()
diff --git a/gcc/gmp/mpn/x86_64/bt1/gcd_22.asm b/gcc/gmp/mpn/x86_64/bt1/gcd_22.asm
new file mode 100644
index 0000000..c9f221e 100644
--- /dev/null
+++ b/gcc/gmp/mpn/x86_64/bt1/gcd_22.asm
@@ -1,0 +1,37 @@
+dnl AMD64 mpn_gcd_22.
+
+dnl Copyright 2019 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_22) +include_mpn(`x86_64/gcd_22.asm') diff --git a/gcc/gmp/mpn/x86_64/bt1/gmp-mparam.h b/gcc/gmp/mpn/x86_64/bt1/gmp-mparam.h new file mode 100644 index 0000000..977a209 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/bt1/gmp-mparam.h @@ -1,0 +1,230 @@ +/* AMD Bobcat gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Disable use of slow functions. FIXME: We should disable lib inclusion. 
*/ +#undef HAVE_NATIVE_mpn_mul_2 +#undef HAVE_NATIVE_mpn_addmul_2 + +/* 1600 MHz AMD Bobcat/Zacate */ +/* FFT tuning limit = 110,472,704 */ +/* Generated by tuneup.c, 2019-10-12, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 31 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 71 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 14 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 20 + +#define DIV_1_VS_MUL_1_PERCENT 270 + +#define MUL_TOOM22_THRESHOLD 24 +#define MUL_TOOM33_THRESHOLD 66 +#define MUL_TOOM44_THRESHOLD 190 +#define MUL_TOOM6H_THRESHOLD 274 +#define MUL_TOOM8H_THRESHOLD 381 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 129 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 127 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 131 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 100 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 30 +#define SQR_TOOM3_THRESHOLD 101 +#define SQR_TOOM4_THRESHOLD 278 +#define SQR_TOOM6_THRESHOLD 372 +#define SQR_TOOM8_THRESHOLD 478 + +#define MULMID_TOOM42_THRESHOLD 22 + +#define MULMOD_BNM1_THRESHOLD 11 +#define SQRMOD_BNM1_THRESHOLD 13 + +#define MUL_FFT_MODF_THRESHOLD 444 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 444, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 21, 8}, \ + { 11, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \ + { 15, 7}, { 31, 8}, { 17, 7}, { 35, 8}, \ + { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 49, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 55,11}, { 
15,10}, { 31, 9}, \ + { 71,10}, { 39, 9}, { 83, 5}, { 1343, 4}, \ + { 2687, 5}, { 1407, 6}, { 735, 7}, { 415, 8}, \ + { 223,10}, { 79,11}, { 47,10}, { 103,12}, \ + { 31,11}, { 63,10}, { 135,11}, { 79,10}, \ + { 167,11}, { 95,10}, { 191,11}, { 111,12}, \ + { 63,11}, { 127,10}, { 255,11}, { 143,10}, \ + { 287, 9}, { 575,11}, { 159,12}, { 95,11}, \ + { 191,10}, { 383,11}, { 207,10}, { 415,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,10}, { 575,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 351,10}, \ + { 703,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,12}, { 223,13}, { 127,12}, { 255,11}, \ + { 543,12}, { 287,11}, { 607,12}, { 319,11}, \ + { 671,12}, { 351,11}, { 703,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ + { 447,14}, { 127,13}, { 255,12}, { 607,13}, \ + { 319,12}, { 703,13}, { 383,12}, { 831,13}, \ + { 447,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1023,13}, { 575,12}, { 1151,13}, { 703,14}, \ + { 383,13}, { 831,12}, { 1663,13}, { 959,15}, \ + { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \ + { 1151,14}, { 639,13}, { 1343,12}, { 2687,13}, \ + { 1407,14}, { 767,13}, { 1599,12}, { 3199,13}, \ + { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,15}, { 767,14}, \ + { 1535,13}, { 3199,14}, { 1663,13}, { 3455,16}, \ + { 511,15}, { 1023,14}, { 2175,13}, { 4479,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \ + { 5887,15}, { 1535,14}, { 3455,13}, { 6911,15}, \ + { 1791,14}, { 3839,16}, { 1023,15}, { 2047,14}, \ + { 4479,15}, { 2303,14}, { 4991,15}, { 2559,14}, \ + { 5247,15}, { 2815,14}, { 5887,16}, { 1535,15}, \ + { 3327,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 183 +#define MUL_FFT_THRESHOLD 5760 + +#define SQR_FFT_MODF_THRESHOLD 380 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 
380, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 25, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 17, 7}, { 35, 8}, \ + { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 31, 8}, \ + { 63, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 63, 6}, { 1087, 7}, { 575, 8}, \ + { 303, 9}, { 159,10}, { 103,12}, { 31,11}, \ + { 63,10}, { 127, 9}, { 255,10}, { 135,11}, \ + { 79,10}, { 159, 9}, { 319,11}, { 95,10}, \ + { 191, 9}, { 383,11}, { 111,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,10}, { 319,12}, { 95,11}, { 191,10}, \ + { 383,11}, { 207,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 271,10}, { 543,11}, \ + { 287,10}, { 575,11}, { 303,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 335,10}, { 671,11}, \ + { 351,10}, { 703,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,12}, { 223,11}, { 447,13}, \ + { 127,12}, { 255,11}, { 543,12}, { 287,11}, \ + { 607,12}, { 319,11}, { 671,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 479,14}, { 127,13}, \ + { 255,12}, { 607,13}, { 319,12}, { 703,13}, \ + { 383,12}, { 831,13}, { 447,12}, { 895,14}, \ + { 255,13}, { 511,12}, { 1023,13}, { 703,14}, \ + { 383,13}, { 831,12}, { 1663,13}, { 895,15}, \ + { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \ + { 1151,14}, { 639,13}, { 1343,12}, { 2687,13}, \ + { 1407,14}, { 767,13}, { 1599,12}, { 3199,13}, \ + { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,15}, { 767,14}, \ + { 1535,13}, { 3199,14}, { 1663,13}, { 3455,16}, \ + { 511,15}, { 1023,14}, { 2175,13}, { 4351,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \ + { 5887,15}, { 1535,14}, { 3455,15}, { 1791,14}, \ + { 3839,16}, { 1023,15}, { 2047,14}, { 
4479,15}, \ + { 2303,14}, { 4863,15}, { 2559,14}, { 5247,15}, \ + { 2815,14}, { 5887,16}, { 1535,15}, { 3327,14}, \ + { 16384,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 186 +#define SQR_FFT_THRESHOLD 3712 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 42 +#define MULLO_MUL_N_THRESHOLD 10950 +#define SQRLO_BASECASE_THRESHOLD 7 +#define SQRLO_DC_THRESHOLD 100 +#define SQRLO_SQR_THRESHOLD 7293 + +#define DC_DIV_QR_THRESHOLD 70 +#define DC_DIVAPPR_Q_THRESHOLD 204 +#define DC_BDIV_QR_THRESHOLD 59 +#define DC_BDIV_Q_THRESHOLD 148 + +#define INV_MULMOD_BNM1_THRESHOLD 46 +#define INV_NEWTON_THRESHOLD 246 +#define INV_APPR_THRESHOLD 236 + +#define BINV_NEWTON_THRESHOLD 252 +#define REDC_1_TO_REDC_2_THRESHOLD 67 +#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */ + +#define MU_DIV_QR_THRESHOLD 1589 +#define MU_DIVAPPR_Q_THRESHOLD 1589 +#define MUPI_DIV_QR_THRESHOLD 108 +#define MU_BDIV_QR_THRESHOLD 1442 +#define MU_BDIV_Q_THRESHOLD 1470 + +#define POWM_SEC_TABLE 1,16,194,960,1603,1811,2499 + +#define GET_STR_DC_THRESHOLD 20 +#define GET_STR_PRECOMPUTE_THRESHOLD 34 +#define SET_STR_DC_THRESHOLD 345 +#define SET_STR_PRECOMPUTE_THRESHOLD 1787 + +#define FAC_DSC_THRESHOLD 781 +#define FAC_ODD_THRESHOLD 104 + +#define MATRIX22_STRASSEN_THRESHOLD 17 +#define HGCD2_DIV1_METHOD 3 /* 3.20% faster than 5 */ +#define HGCD_THRESHOLD 110 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 2681 +#define GCD_DC_THRESHOLD 474 +#define GCDEXT_DC_THRESHOLD 293 +#define JACOBI_BASE_METHOD 2 /* 9.38% faster than 1 */ + +/* Tuneup completed successfully, took 358881 seconds */ diff --git a/gcc/gmp/mpn/x86_64/bt1/mul_1.asm b/gcc/gmp/mpn/x86_64/bt1/mul_1.asm new file mode 100644 index 0000000..4394d6e 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/bt1/mul_1.asm @@ -1,0 +1,241 @@ +dnl AMD64 mpn_mul_1 optimised for AMD bt1/bt2. 
+ +dnl Copyright 2003-2005, 2007, 2008, 2011, 2012, 2019 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4.53 old measurement +C AMD K10 4.53 old measurement +C AMD bd1 4.56 old measurement +C AMD bd2 4.47 old measurement +C AMD bd3 ? +C AMD bd4 ? +C AMD zen ? +C AMD bt1 5.12 +C AMD bt2 5.17 +C Intel P4 12.6 old measurement +C Intel PNR 4.53 old measurement +C Intel NHM 4.36 old measurement +C Intel SBR 3.0 old measurement +C Intel IBR 2.55 old measurement +C Intel HWL 2.28 old measurement +C Intel BWL 2.36 old measurement +C Intel SKL 2.39 old measurement +C Intel atom 21.0 old measurement +C Intel SLM 9 old measurement +C Intel GLM ? +C VIA nano ? + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. 
+ +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +C Standard parameters +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n_param', `%rdx') +define(`v0', `%rcx') +define(`cy', `%r8') +C Standard allocations +define(`n', `%rbx') +define(`w0', `%r8') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + +C DOS64 parameters +IFDOS(` define(`rp', `%rcx') ') dnl +IFDOS(` define(`up', `%rsi') ') dnl +IFDOS(` define(`n_param', `%r8') ') dnl +IFDOS(` define(`v0', `%r9') ') dnl +IFDOS(` define(`cy', `56(%rsp)')') dnl +C DOS64 allocations +IFDOS(` define(`n', `%rbx') ') dnl +IFDOS(` define(`w0', `%r8') ') dnl +IFDOS(` define(`w1', `%rdi') ') dnl +IFDOS(` define(`w2', `%r10') ') dnl +IFDOS(` define(`w3', `%r11') ') dnl + + ALIGN(64) +PROLOGUE(mpn_mul_1) +IFDOS(` push %rsi ') +IFDOS(` push %rdi ') +IFDOS(` mov %rdx, %rsi ') + + push %rbx + mov (up), %rax + + lea (rp,n_param,8), rp + lea (up,n_param,8), up + mov n_param, n + + test $1, R8(n_param) + jne L(bx1) + +L(bx0): mul v0 + neg n + mov %rax, w0 + mov %rdx, w1 + test $2, R8(n) + jne L(L2) + +L(b00): add $2, n + jmp L(L0) + + ALIGN(16) +L(b11): mov %rax, w2 + mov %rdx, w3 + neg n + inc n + jmp L(L3) + + ALIGN(16) +L(bx1): mul v0 + test $2, R8(n) + jne L(b11) + +L(b01): sub $3, n + jc L(n1) + mov %rax, w2 + mov %rdx, w3 + neg n + + ALIGN(16) +L(top): mov -16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, -24(rp,n,8) + add w3, w0 + adc $0, w1 +L(L0): mov -8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + mov w0, -16(rp,n,8) + add w1, w2 + adc $0, w3 +L(L3): mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, -8(rp,n,8) + add w3, w0 + adc $0, w1 +L(L2): mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + mov w0, (rp,n,8) + add w1, w2 + adc $0, w3 + add $4, n + js L(top) + +L(end): mov w2, -8(rp) + mov w3, %rax + pop %rbx +IFDOS(` pop %rdi ') +IFDOS(` pop %rsi ') + ret + + ALIGN(32) +L(n1): mov %rax, -8(rp) + mov %rdx, %rax + pop %rbx +IFDOS(` pop %rdi ') +IFDOS(` pop 
%rsi ') + ret +EPILOGUE() + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_mul_1c) +IFDOS(` push %rsi ') +IFDOS(` push %rdi ') +IFDOS(` mov %rdx, %rsi ') + mov cy, w2 + push %rbx + mov (up), %rax + + lea (rp,n_param,8), rp + lea (up,n_param,8), up + mov n_param, n + + test $1, R8(n_param) + jne L(cx1) + +L(cx0): mul v0 + neg n + mov %rax, w0 + mov %rdx, w1 + add w2, w0 + adc $0, w1 + test $2, R8(n) + jne L(L2) + +L(c00): add $2, n + jmp L(L0) + + ALIGN(16) +L(cx1): mul v0 + test $2, R8(n) + je L(c01) + +L(c11): neg n + inc n + add %rax, w2 + mov %rdx, w3 + adc $0, w3 + jmp L(L3) + +L(c01): cmp $1, n + jz L(m1) + neg n + add $3, n + add %rax, w2 + mov %rdx, w3 + adc $0, w3 + jmp L(top) + + ALIGN(32) +L(m1): add %rax, w2 + mov %rdx, %rax + mov w2, -8(rp) + adc $0, %rax + pop %rbx +IFDOS(` pop %rdi ') +IFDOS(` pop %rsi ') + ret +EPILOGUE() diff --git a/gcc/gmp/mpn/x86_64/bt1/mul_basecase.asm b/gcc/gmp/mpn/x86_64/bt1/mul_basecase.asm new file mode 100644 index 0000000..e7d46bf 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/bt1/mul_basecase.asm @@ -1,0 +1,486 @@ +dnl AMD64 mpn_mul_basecase optimised for AMD bobcat. + +dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4.5 +C AMD K10 4.5 +C AMD bd1 4.75 +C AMD bobcat 5 +C Intel P4 17.7 +C Intel core2 5.5 +C Intel NHM 5.43 +C Intel SBR 3.92 +C Intel atom 23 +C VIA nano 5.63 + +C This mul_basecase is based on mul_1 and addmul_1, since these both run at the +C multiply insn bandwidth, without any apparent loop branch exit pipeline +C replays experienced on K8. The structure is unusual: it falls into mul_1 in +C the same way for all n, then it splits into 4 different wind-down blocks and +C 4 separate addmul_1 loops. +C +C We have not tried using the same addmul_1 loops with a switch into feed-in +C code, as we do in other basecase implementations. Doing that could save +C substantial code volume, but would also probably add some overhead. + +C TODO +C * Tune un < 3 code. +C * Fix slowdown for un=vn=3 (67->71) compared to default code. +C * This is 1263 bytes, compared to 1099 bytes for default code. Consider +C combining addmul loops like that code. Tolerable slowdown? +C * Lots of space could be saved by replacing the "switch" code by gradual +C jumps out from mul_1 winddown code, perhaps with no added overhead. +C * Are the ALIGN(16) really necessary? They add about 25 bytes of padding. + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +C Standard parameters +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param', `%rdx') +define(`vp', `%rcx') +define(`vn', `%r8') +C Standard allocations +define(`un', `%rbx') +define(`w0', `%r10') +define(`w1', `%r11') +define(`w2', `%r12') +define(`w3', `%r13') +define(`n', `%rbp') +define(`v0', `%r9') + +C Temp macro for allowing control over indexing. +C Define to return $1 for more conservative ptr handling. 
+define(`X',`$2') + + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_basecase) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + + mov (up), %rax + mov (vp), v0 + + cmp $2, un_param + ja L(ge3) + jz L(u2) + + mul v0 C u0 x v0 + mov %rax, (rp) + mov %rdx, 8(rp) + FUNC_EXIT() + ret + +L(u2): mul v0 C u0 x v0 + mov %rax, (rp) + mov 8(up), %rax + mov %rdx, w0 + mul v0 + add %rax, w0 + mov %rdx, w1 + adc $0, w1 + cmp $1, R32(vn) + jnz L(u2v2) + mov w0, 8(rp) + mov w1, 16(rp) + FUNC_EXIT() + ret + +L(u2v2):mov 8(vp), v0 + mov (up), %rax + mul v0 + add %rax, w0 + mov w0, 8(rp) + mov %rdx, %r8 C CAUTION: r8 realloc + adc $0, %r8 + mov 8(up), %rax + mul v0 + add w1, %r8 + adc $0, %rdx + add %r8, %rax + adc $0, %rdx + mov %rax, 16(rp) + mov %rdx, 24(rp) + FUNC_EXIT() + ret + + +L(ge3): push %rbx + push %rbp + push %r12 + push %r13 + + lea 8(vp), vp + + lea -24(rp,un_param,8), rp + lea -24(up,un_param,8), up + xor R32(un), R32(un) + mov $2, R32(n) + sub un_param, un + sub un_param, n + + mul v0 + mov %rax, w2 + mov %rdx, w3 + jmp L(L3) + + ALIGN(16) +L(top): mov w0, -16(rp,n,8) + add w1, w2 + adc $0, w3 + mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, -8(rp,n,8) + add w3, w0 + adc $0, w1 + mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + mov w0, (rp,n,8) + add w1, w2 + adc $0, w3 +L(L3): mov 16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, 8(rp,n,8) + add w3, w0 + adc $0, w1 + mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(top) + + mov w0, -16(rp,n,8) + add w1, w2 + adc $0, w3 + +C Switch on n into right addmul_1 loop + test n, n + jz L(r2) + cmp $2, R32(n) + ja L(r3) + jz L(r0) + jmp L(r1) + + +L(r3): mov w2, X(-8(rp,n,8),16(rp)) + mov w3, X((rp,n,8),24(rp)) + add $2, un + +C outer loop(3) +L(to3): dec vn + jz L(ret) + mov (vp), v0 + mov 8(up,un,8), %rax + lea 8(vp), vp + lea 8(rp), rp + mov un, n + mul v0 + mov %rax, w2 + mov %rdx, w3 + jmp L(al3) + + ALIGN(16) +L(ta3): add w0, -16(rp,n,8) + 
adc w1, w2 + adc $0, w3 + mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add w0, (rp,n,8) + adc w1, w2 + adc $0, w3 +L(al3): mov 16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(ta3) + + add w0, X(-16(rp,n,8),8(rp)) + adc w1, w2 + adc $0, w3 + add w2, X(-8(rp,n,8),16(rp)) + adc $0, w3 + mov w3, X((rp,n,8),24(rp)) + jmp L(to3) + + +L(r2): mov X(0(up,n,8),(up)), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, X(-8(rp,n,8),-8(rp)) + add w3, w0 + adc $0, w1 + mov X(8(up,n,8),8(up)), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + mov w0, X((rp,n,8),(rp)) + add w1, w2 + adc $0, w3 + mov X(16(up,n,8),16(up)), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, X(8(rp,n,8),8(rp)) + add w3, w0 + adc $0, w1 + mov w0, X(16(rp,n,8),16(rp)) + adc $0, w3 + mov w1, X(24(rp,n,8),24(rp)) + inc un + +C outer loop(2) +L(to2): dec vn + jz L(ret) + mov (vp), v0 + mov 16(up,un,8), %rax + lea 8(vp), vp + lea 8(rp), rp + mov un, n + mul v0 + mov %rax, w0 + mov %rdx, w1 + jmp L(al2) + + ALIGN(16) +L(ta2): add w0, -16(rp,n,8) + adc w1, w2 + adc $0, w3 + mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add w0, (rp,n,8) + adc w1, w2 + adc $0, w3 + mov 16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(rp,n,8) + adc w3, w0 + adc $0, w1 +L(al2): mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(ta2) + + add w0, X(-16(rp,n,8),8(rp)) + adc w1, w2 + adc $0, w3 + add w2, X(-8(rp,n,8),16(rp)) + adc $0, w3 + mov w3, X((rp,n,8),24(rp)) + jmp L(to2) + + +L(r1): mov X(0(up,n,8),8(up)), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, X(-8(rp,n,8),(rp)) + add w3, w0 + adc $0, w1 + 
mov X(8(up,n,8),16(up)), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + mov w0, X((rp,n,8),8(rp)) + add w1, w2 + adc $0, w3 + mov w2, X(8(rp,n,8),16(rp)) + mov w3, X(16(rp,n,8),24(rp)) + add $4, un + +C outer loop(1) +L(to1): dec vn + jz L(ret) + mov (vp), v0 + mov -8(up,un,8), %rax + lea 8(vp), vp + lea 8(rp), rp + mov un, n + mul v0 + mov %rax, w2 + mov %rdx, w3 + jmp L(al1) + + ALIGN(16) +L(ta1): add w0, -16(rp,n,8) + adc w1, w2 + adc $0, w3 +L(al1): mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add w0, (rp,n,8) + adc w1, w2 + adc $0, w3 + mov 16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(ta1) + + add w0, X(-16(rp,n,8),8(rp)) + adc w1, w2 + adc $0, w3 + add w2, X(-8(rp,n,8),16(rp)) + adc $0, w3 + mov w3, X((rp,n,8),24(rp)) + jmp L(to1) + + +L(r0): mov X((up,n,8),16(up)), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, X(-8(rp,n,8),8(rp)) + add w3, w0 + adc $0, w1 + mov w0, X((rp,n,8),16(rp)) + mov w1, X(8(rp,n,8),24(rp)) + add $3, un + +C outer loop(0) +L(to0): dec vn + jz L(ret) + mov (vp), v0 + mov (up,un,8), %rax + lea 8(vp), vp + lea 8(rp), rp + mov un, n + mul v0 + mov %rax, w0 + mov %rdx, w1 + jmp L(al0) + + ALIGN(16) +L(ta0): add w0, -16(rp,n,8) + adc w1, w2 + adc $0, w3 + mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(rp,n,8) + adc w3, w0 + adc $0, w1 +L(al0): mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add w0, (rp,n,8) + adc w1, w2 + adc $0, w3 + mov 16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(ta0) + + add w0, X(-16(rp,n,8),8(rp)) + adc w1, w2 + adc $0, w3 + add w2, X(-8(rp,n,8),16(rp)) + adc $0, w3 + mov w3, 
X((rp,n,8),24(rp)) + jmp L(to0) + + +L(ret): pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gcc/gmp/mpn/x86_64/bt1/redc_1.asm b/gcc/gmp/mpn/x86_64/bt1/redc_1.asm new file mode 100644 index 0000000..d55b1e5 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/bt1/redc_1.asm @@ -1,0 +1,507 @@ +dnl X86-64 mpn_redc_1 optimised for AMD bobcat. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C AMD bull ? +C AMD pile ? +C AMD steam ? +C AMD bobcat 5.0 +C AMD jaguar ? +C Intel P4 ? +C Intel core ? +C Intel NHM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel atom ? +C VIA nano ? + +C TODO +C * Micro-optimise, none performed thus far. +C * Consider inlining mpn_add_n. +C * Single basecases out before the pushes. 
+ +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. +define(`I',`$1') + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`mp_param', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`u0inv', `%r8') C stack + +define(`i', `%r14') +define(`j', `%r15') +define(`mp', `%r12') +define(`q0', `%r13') +define(`w0', `%rbp') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +define(`ALIGNx', `ALIGN(16)') + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_redc_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov (up), q0 + mov n, j C outer loop induction var + lea (mp_param,n,8), mp + lea (up,n,8), up + neg n + imul u0inv, q0 C first iteration q0 + + test $1, R8(n) + jz L(bx0) + +L(bx1): test $2, R8(n) + jz L(b3) + +L(b1): cmp $-1, R32(n) + jz L(n1) + +L(otp1):lea 1(n), i + mov (mp,n,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + mov 8(mp,n,8), %rax + mul q0 + mov %rax, %rbx + mov %rdx, w1 + add (up,n,8), w2 + adc w3, %rbx + adc $0, w1 + mov 16(mp,n,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add 8(up,n,8), %rbx + mov %rbx, 8(up,n,8) + adc w1, w2 + adc $0, w3 + imul u0inv, %rbx C next q limb + jmp L(e1) + + ALIGNx +L(tp1): add w0, -16(up,i,8) + adc w1, w2 + adc $0, w3 + mov (mp,i,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(up,i,8) + adc w3, w0 + adc $0, w1 + mov 8(mp,i,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add w0, (up,i,8) + adc w1, w2 + adc $0, w3 +L(e1): mov 16(mp,i,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(up,i,8) + adc w3, w0 + adc $0, w1 + mov 24(mp,i,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add $4, i + js L(tp1) + +L(ed1): add w0, I(-16(up),-16(up,i,8)) + adc w1, w2 + adc $0, w3 + add w2, I(-8(up),-8(up,i,8)) + adc $0, w3 + mov w3, (up,n,8) C up[0] 
+ mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp1) + jmp L(cj) + +L(b3): cmp $-3, R32(n) + jz L(n3) + +L(otp3):lea 3(n), i + mov (mp,n,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + mov 8(mp,n,8), %rax + mul q0 + mov %rax, %rbx + mov %rdx, w1 + add (up,n,8), w2 + adc w3, %rbx + adc $0, w1 + mov 16(mp,n,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add 8(up,n,8), %rbx + mov %rbx, 8(up,n,8) + adc w1, w2 + adc $0, w3 + imul u0inv, %rbx C next q limb + jmp L(e3) + + ALIGNx +L(tp3): add w0, -16(up,i,8) + adc w1, w2 + adc $0, w3 +L(e3): mov (mp,i,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(up,i,8) + adc w3, w0 + adc $0, w1 + mov 8(mp,i,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add w0, (up,i,8) + adc w1, w2 + adc $0, w3 + mov 16(mp,i,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(up,i,8) + adc w3, w0 + adc $0, w1 + mov 24(mp,i,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add $4, i + js L(tp3) + +L(ed3): add w0, I(-16(up),-16(up,i,8)) + adc w1, w2 + adc $0, w3 + add w2, I(-8(up),-8(up,i,8)) + adc $0, w3 + mov w3, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp3) +C jmp L(cj) + +L(cj): +IFSTD(` lea (up,n,8), up C param 2: up + lea (up,n,8), %rdx C param 3: up - n + neg R32(n) ') C param 4: n + +IFDOS(` lea (up,n,8), %rdx C param 2: up + lea (%rdx,n,8), %r8 C param 3: up - n + neg R32(n) + mov n, %r9 C param 4: n + mov rp, %rcx ') C param 1: rp + +IFSTD(` sub $8, %rsp ') +IFDOS(` sub $40, %rsp ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_add_n) +IFSTD(` add $8, %rsp ') +IFDOS(` add $40, %rsp ') + +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(bx0): test $2, R8(n) + jnz L(b2) + +L(b0): +L(otp0):lea (n), i + mov (mp,n,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + mov 8(mp,n,8), %rax + mul q0 + mov %rax, %rbx + mov %rdx, w3 + add (up,n,8), w0 + adc w1, %rbx + adc $0, w3 
+ mov 16(mp,n,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add 8(up,n,8), %rbx + mov %rbx, 8(up,n,8) + adc w3, w0 + adc $0, w1 + imul u0inv, %rbx C next q limb + jmp L(e0) + + ALIGNx +L(tp0): add w0, -16(up,i,8) + adc w1, w2 + adc $0, w3 + mov (mp,i,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(up,i,8) + adc w3, w0 + adc $0, w1 + mov 8(mp,i,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add w0, (up,i,8) + adc w1, w2 + adc $0, w3 + mov 16(mp,i,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(up,i,8) + adc w3, w0 + adc $0, w1 +L(e0): mov 24(mp,i,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add $4, i + js L(tp0) + +L(ed0): add w0, I(-16(up),-16(up,i,8)) + adc w1, w2 + adc $0, w3 + add w2, I(-8(up),-8(up,i,8)) + adc $0, w3 + mov w3, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp0) + jmp L(cj) + +L(b2): cmp $-2, R32(n) + jz L(n2) + +L(otp2):lea 2(n), i + mov (mp,n,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + mov 8(mp,n,8), %rax + mul q0 + mov %rax, %rbx + mov %rdx, w3 + add (up,n,8), w0 + adc w1, %rbx + adc $0, w3 + mov 16(mp,n,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add 8(up,n,8), %rbx + mov %rbx, 8(up,n,8) + adc w3, w0 + adc $0, w1 + imul u0inv, %rbx C next q limb + jmp L(e2) + + ALIGNx +L(tp2): add w0, -16(up,i,8) + adc w1, w2 + adc $0, w3 + mov (mp,i,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(up,i,8) + adc w3, w0 + adc $0, w1 +L(e2): mov 8(mp,i,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add w0, (up,i,8) + adc w1, w2 + adc $0, w3 + mov 16(mp,i,8), %rax + mul q0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(up,i,8) + adc w3, w0 + adc $0, w1 + mov 24(mp,i,8), %rax + mul q0 + mov %rax, w2 + mov %rdx, w3 + add $4, i + js L(tp2) + +L(ed2): add w0, I(-16(up),-16(up,i,8)) + adc w1, w2 + adc $0, w3 + add w2, I(-8(up),-8(up,i,8)) + adc $0, w3 + mov w3, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ 
+ dec j + jnz L(otp2) + jmp L(cj) + +L(n1): mov (mp_param), %rax + mul q0 + add -8(up), %rax + adc (up), %rdx + mov %rdx, (rp) + mov $0, R32(%rax) + adc R32(%rax), R32(%rax) + jmp L(ret) + +L(n2): mov (mp_param), %rax + mov -16(up), %rbp + mul q0 + add %rax, %rbp + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + mov -8(up), %r10 + mul q0 + add %rax, %r10 + mov %rdx, %r11 + adc $0, %r11 + add %r9, %r10 + adc $0, %r11 + mov %r10, q0 + imul u0inv, q0 C next q0 + mov -16(mp), %rax + mul q0 + add %rax, %r10 + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + mov (up), %r14 + mul q0 + add %rax, %r14 + adc $0, %rdx + add %r9, %r14 + adc $0, %rdx + xor R32(%rax), R32(%rax) + add %r11, %r14 + adc 8(up), %rdx + mov %r14, (rp) + mov %rdx, 8(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) + + ALIGNx +L(n3): mov -24(mp), %rax + mov -24(up), %r10 + mul q0 + add %rax, %r10 + mov -16(mp), %rax + mov %rdx, %r11 + adc $0, %r11 + mov -16(up), %rbp + mul q0 + add %rax, %rbp + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + add %r11, %rbp + mov -8(up), %r10 + adc $0, %r9 + mul q0 + mov %rbp, q0 + imul u0inv, q0 C next q0 + add %rax, %r10 + mov %rdx, %r11 + adc $0, %r11 + mov %rbp, -16(up) + add %r9, %r10 + adc $0, %r11 + mov %r10, -8(up) + mov %r11, -24(up) C up[0] + lea 8(up), up C up++ + dec j + jnz L(n3) + + mov -48(up), %rdx + mov -40(up), %rbx + xor R32(%rax), R32(%rax) + add %rbp, %rdx + adc %r10, %rbx + adc -8(up), %r11 + mov %rdx, (rp) + mov %rbx, 8(rp) + mov %r11, 16(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) +EPILOGUE() +ASM_END() diff --git a/gcc/gmp/mpn/x86_64/bt1/sqr_basecase.asm b/gcc/gmp/mpn/x86_64/bt1/sqr_basecase.asm new file mode 100644 index 0000000..0e417a1 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/bt1/sqr_basecase.asm @@ -1,0 +1,565 @@ +dnl AMD64 mpn_sqr_basecase optimised for AMD bobcat. + +dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4.5 +C AMD K10 4.5 +C AMD bd1 4.75 +C AMD bobcat 5 +C Intel P4 17.7 +C Intel core2 5.5 +C Intel NHM 5.43 +C Intel SBR 3.92 +C Intel atom 23 +C VIA nano 5.63 + +C This sqr_basecase is based on mul_1 and addmul_1, since these both run at the +C multiply insn bandwidth, without any apparent loop branch exit pipeline +C replays experienced on K8. The structure is unusual: it falls into mul_1 in +C the same way for all n, then it splits into 4 different wind-down blocks and +C 4 separate addmul_1 loops. +C +C We have not tried using the same addmul_1 loops with a switch into feed-in +C code, as we do in other basecase implementations. Doing that could save +C substantial code volume, but would also probably add some overhead. + +C TODO +C * Tune un < 4 code. +C * Perhaps implement a larger final corner (it is now 2 x 1). 
+C * Lots of space could be saved by replacing the "switch" code by gradual +C jumps out from mul_1 winddown code, perhaps with no added overhead. +C * Are the ALIGN(16) really necessary? They add about 25 bytes of padding. + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +C Standard parameters +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param', `%rdx') +C Standard allocations +define(`un', `%rbx') +define(`w0', `%r8') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') +define(`n', `%rbp') +define(`v0', `%rcx') + +C Temp macro for allowing control over indexing. +C Define to return $1 for more conservative ptr handling. +define(`X',`$2') +dnl define(`X',`$1') + + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_sqr_basecase) + FUNC_ENTRY(3) + + mov (up), %rax + + cmp $2, R32(un_param) + jae L(ge2) + + mul %rax + mov %rax, (rp) + mov %rdx, 8(rp) + FUNC_EXIT() + ret + +L(ge2): mov (up), v0 + jnz L(g2) + + mul %rax + mov %rax, (rp) + mov 8(up), %rax + mov %rdx, w0 + mul v0 + add %rax, w0 + mov %rdx, w1 + adc $0, w1 + mov 8(up), v0 + mov (up), %rax + mul v0 + add %rax, w0 + mov w0, 8(rp) + mov %rdx, w0 C CAUTION: r8 realloc + adc $0, w0 + mov 8(up), %rax + mul v0 + add w1, w0 + adc $0, %rdx + add w0, %rax + adc $0, %rdx + mov %rax, 16(rp) + mov %rdx, 24(rp) + FUNC_EXIT() + ret + +L(g2): cmp $3, R32(un_param) + ja L(g3) + mul %rax + mov %rax, (rp) + mov %rdx, 8(rp) + mov 8(up), %rax + mul %rax + mov %rax, 16(rp) + mov %rdx, 24(rp) + mov 16(up), %rax + mul %rax + mov %rax, 32(rp) + mov %rdx, 40(rp) + + mov (up), v0 + mov 8(up), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov 16(up), %rax + mul v0 + xor R32(w2), R32(w2) + add %rax, w1 + adc %rdx, w2 + + mov 8(up), v0 + mov 16(up), %rax + mul v0 + xor R32(w3), R32(w3) + add %rax, w2 + adc %rdx, w3 + add w0, w0 + adc w1, w1 + adc w2, w2 + adc w3, w3 + mov $0, R32(v0) + adc v0, v0 + add w0, 8(rp) + adc w1, 16(rp) + adc w2, 24(rp) + adc w3, 32(rp) + adc v0, 40(rp) + FUNC_EXIT() + ret + +L(g3): push %rbx + 
push %rbp + + mov 8(up), %rax + lea -24(rp,un_param,8), rp + lea -24(up,un_param,8), up + neg un_param + push un_param C for sqr_diag_addlsh1 + lea (un_param), un + lea 3(un_param), n + + mul v0 + mov %rax, w2 + mov %rdx, w3 + jmp L(L3) + + ALIGN(16) +L(top): mov w0, -16(rp,n,8) + add w1, w2 + adc $0, w3 + mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, -8(rp,n,8) + add w3, w0 + adc $0, w1 + mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + mov w0, (rp,n,8) + add w1, w2 + adc $0, w3 +L(L3): mov 16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, 8(rp,n,8) + add w3, w0 + adc $0, w1 + mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(top) + + mov w0, -16(rp,n,8) + add w1, w2 + adc $0, w3 + + test n, n + jz L(r2) + cmp $2, R32(n) + ja L(r3) + jz L(r0) + + +L(r1): mov X((up,n,8),8(up)), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, X(-8(rp,n,8),(rp)) + add w3, w0 + adc $0, w1 + mov X(8(up,n,8),16(up)), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + mov w0, X((rp,n,8),8(rp)) + add w1, w2 + adc $0, w3 + mov w2, X(8(rp,n,8),16(rp)) + mov w3, X(16(rp,n,8),24(rp)) + add $5, un + jmp L(to0) + +L(r2): mov X((up,n,8),(up)), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, X(-8(rp,n,8),-8(rp)) + add w3, w0 + adc $0, w1 + mov X(8(up,n,8),8(up)), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + mov w0, X((rp,n,8),(rp)) + add w1, w2 + adc $0, w3 + mov X(16(up,n,8),16(up)), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, X(8(rp,n,8),8(rp)) + add w3, w0 + adc $0, w1 + mov w0, X(16(rp,n,8),16(rp)) + adc $0, w3 + mov w1, X(24(rp,n,8),24(rp)) + add $6, un + jmp L(to1) + +L(r3): mov w2, X(-8(rp,n,8),16(rp)) + mov w3, X((rp,n,8),24(rp)) + add $3, un + jmp L(to2) + +L(r0): mov X((up,n,8),16(up)), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov w2, X(-8(rp,n,8),8(rp)) + add w3, w0 + adc $0, w1 + mov w0, X((rp,n,8),16(rp)) + mov w1, X(8(rp,n,8),24(rp)) + add $4, un +C jmp L(to3) +C fall through into 
main loop + + +L(outer): + mov un, n + mov (up,un,8), v0 + mov 8(up,un,8), %rax + lea 8(rp), rp + mul v0 + mov %rax, w2 + mov %rdx, w3 + jmp L(al3) + + ALIGN(16) +L(ta3): add w0, -16(rp,n,8) + adc w1, w2 + adc $0, w3 + mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add w0, (rp,n,8) + adc w1, w2 + adc $0, w3 +L(al3): mov 16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(ta3) + + add w0, X(-16(rp,n,8),8(rp)) + adc w1, w2 + adc $0, w3 + add w2, X(-8(rp,n,8),16(rp)) + adc $0, w3 + mov w3, X((rp,n,8),24(rp)) + + +L(to2): mov un, n + cmp $-4, R32(un) + jnc L(end) + add $4, un + mov 8(up,n,8), v0 + mov 16(up,n,8), %rax + lea 8(rp), rp + mul v0 + mov %rax, w0 + mov %rdx, w1 + jmp L(al2) + + ALIGN(16) +L(ta2): add w0, -16(rp,n,8) + adc w1, w2 + adc $0, w3 + mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add w0, (rp,n,8) + adc w1, w2 + adc $0, w3 + mov 16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(rp,n,8) + adc w3, w0 + adc $0, w1 +L(al2): mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(ta2) + + add w0, X(-16(rp,n,8),8(rp)) + adc w1, w2 + adc $0, w3 + add w2, X(-8(rp,n,8),16(rp)) + adc $0, w3 + mov w3, X((rp,n,8),24(rp)) + + +L(to1): mov un, n + mov -16(up,un,8), v0 + mov -8(up,un,8), %rax + lea 8(rp), rp + mul v0 + mov %rax, w2 + mov %rdx, w3 + jmp L(al1) + + ALIGN(16) +L(ta1): add w0, -16(rp,n,8) + adc w1, w2 + adc $0, w3 +L(al1): mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add w0, (rp,n,8) + adc w1, w2 + adc $0, w3 + mov 
16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(ta1) + + add w0, X(-16(rp,n,8),8(rp)) + adc w1, w2 + adc $0, w3 + add w2, X(-8(rp,n,8),16(rp)) + adc $0, w3 + mov w3, X((rp,n,8),24(rp)) + + +L(to0): mov un, n + mov -8(up,un,8), v0 + mov (up,un,8), %rax + lea 8(rp), rp + mul v0 + mov %rax, w0 + mov %rdx, w1 + jmp L(al0) + + ALIGN(16) +L(ta0): add w0, -16(rp,n,8) + adc w1, w2 + adc $0, w3 + mov (up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, -8(rp,n,8) + adc w3, w0 + adc $0, w1 +L(al0): mov 8(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add w0, (rp,n,8) + adc w1, w2 + adc $0, w3 + mov 16(up,n,8), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + add w2, 8(rp,n,8) + adc w3, w0 + adc $0, w1 + mov 24(up,n,8), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add $4, n + js L(ta0) + + add w0, X(-16(rp,n,8),8(rp)) + adc w1, w2 + adc $0, w3 + add w2, X(-8(rp,n,8),16(rp)) + adc $0, w3 + mov w3, X((rp,n,8),24(rp)) + jmp L(outer) + + +L(end): mov X(8(up,un,8),(up)), v0 + mov X(16(up,un,8),8(up)), %rax + mul v0 + mov %rax, w0 + mov %rdx, w1 + mov X(24(up,un,8),16(up)), %rax + mul v0 + mov %rax, w2 + mov %rdx, w3 + add w0, X(24(rp,un,8),16(rp)) + adc w1, w2 + adc $0, w3 + add w2, X(32(rp,un,8),24(rp)) + adc $0, w3 + mov X(16(up,un,8),8(up)), v0 + mov X(24(up,un,8),16(up)), %rax + mul v0 + add %rax, w3 + mov w3, X(40(rp,un,8),32(rp)) + adc $0, %rdx + mov %rdx, X(48(rp,un,8),40(rp)) + + +C sqr_diag_addlsh1 + + lea 16(up), up + lea 40(rp), rp + pop n + lea 2(n,n), n + + mov (up,n,4), %rax + mul %rax + xor R32(w2), R32(w2) + + mov 8(rp,n,8), w0 + mov %rax, (rp,n,8) + jmp L(lm) + + ALIGN(8) +L(tsd): add %rbx, w0 + adc %rax, w1 + mov w0, -8(rp,n,8) + mov 8(rp,n,8), w0 + mov w1, (rp,n,8) +L(lm): mov 16(rp,n,8), w1 + adc w0, w0 + adc w1, w1 + lea (%rdx,w2), %rbx + mov 8(up,n,4), %rax + setc R8(w2) + mul %rax + add $2, n + js L(tsd) + 
+L(esd): add %rbx, w0 + adc %rax, w1 + mov w0, X(-8(rp,n,8),-8(rp)) + mov w1, X((rp,n,8),(rp)) + adc w2, %rdx + mov %rdx, X(8(rp,n,8),8(rp)) + + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gcc/gmp/mpn/x86_64/bt2/com.asm b/gcc/gmp/mpn/x86_64/bt2/com.asm new file mode 100644 index 0000000..87085ea 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/bt2/com.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_com. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_com) +include_mpn(`x86_64/fastsse/com.asm') diff --git a/gcc/gmp/mpn/x86_64/bt2/copyd.asm b/gcc/gmp/mpn/x86_64/bt2/copyd.asm new file mode 100644 index 0000000..83c0618 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/bt2/copyd.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_copyd. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyd) +include_mpn(`x86_64/fastsse/copyd.asm') diff --git a/gcc/gmp/mpn/x86_64/bt2/copyi.asm b/gcc/gmp/mpn/x86_64/bt2/copyi.asm new file mode 100644 index 0000000..148d0e5 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/bt2/copyi.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_copyi. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyi) +include_mpn(`x86_64/fastsse/copyi.asm') diff --git a/gcc/gmp/mpn/x86_64/bt2/gcd_11.asm b/gcc/gmp/mpn/x86_64/bt2/gcd_11.asm new file mode 100644 index 0000000..0ffb6ca 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/bt2/gcd_11.asm @@ -1,0 +1,37 @@ +dnl AMD64 mpn_gcd_11. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_11) +include_mpn(`x86_64/bd2/gcd_11.asm') diff --git a/gcc/gmp/mpn/x86_64/bt2/gcd_22.asm b/gcc/gmp/mpn/x86_64/bt2/gcd_22.asm new file mode 100644 index 0000000..d693628 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/bt2/gcd_22.asm @@ -1,0 +1,37 @@ +dnl AMD64 mpn_gcd_22. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_22) +include_mpn(`x86_64/bd2/gcd_22.asm') diff --git a/gcc/gmp/mpn/x86_64/bt2/gmp-mparam.h b/gcc/gmp/mpn/x86_64/bt2/gmp-mparam.h new file mode 100644 index 0000000..3e26726 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/bt2/gmp-mparam.h @@ -1,0 +1,240 @@ +/* AMD Jaguar gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. 
+ +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Disable use of slow functions. FIXME: We should disable lib inclusion. 
*/ +#undef HAVE_NATIVE_mpn_mul_2 +#undef HAVE_NATIVE_mpn_addmul_2 + +/* 2050 MHz AMD Jaguar/Kabini */ +/* FFT tuning limit = 225,381,546 */ +/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 4 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 65 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 4 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 15 + +#define DIV_1_VS_MUL_1_PERCENT 267 + +#define MUL_TOOM22_THRESHOLD 25 +#define MUL_TOOM33_THRESHOLD 32 +#define MUL_TOOM44_THRESHOLD 93 +#define MUL_TOOM6H_THRESHOLD 366 +#define MUL_TOOM8H_THRESHOLD 537 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 63 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 172 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 63 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 67 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 91 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 20 +#define SQR_TOOM3_THRESHOLD 97 +#define SQR_TOOM4_THRESHOLD 220 +#define SQR_TOOM6_THRESHOLD 318 +#define SQR_TOOM8_THRESHOLD 434 + +#define MULMID_TOOM42_THRESHOLD 20 + +#define MULMOD_BNM1_THRESHOLD 11 +#define SQRMOD_BNM1_THRESHOLD 13 + +#define MUL_FFT_MODF_THRESHOLD 348 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 348, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 23, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 21, 9}, { 11, 8}, { 29, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 49, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 79,10}, { 
55,11}, \ + { 31,10}, { 63, 6}, { 1087, 8}, { 303, 9}, \ + { 159,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,11}, { 79,10}, { 159, 9}, \ + { 319,10}, { 167,11}, { 95,10}, { 191, 9}, \ + { 383,10}, { 207, 9}, { 415,11}, { 111,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271,11}, { 143,10}, { 287, 9}, { 575,10}, \ + { 303,11}, { 159,10}, { 319,12}, { 95,11}, \ + { 191,10}, { 383,11}, { 207,10}, { 415,11}, \ + { 223,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 351,12}, { 191,11}, { 415,12}, \ + { 223,11}, { 479,13}, { 127,12}, { 255,11}, \ + { 543,12}, { 287,11}, { 607,12}, { 319,11}, \ + { 639,12}, { 351,13}, { 191,12}, { 383,11}, \ + { 767,12}, { 415,11}, { 831,12}, { 479,14}, \ + { 127,13}, { 255,12}, { 543,11}, { 1087,12}, \ + { 607,13}, { 319,12}, { 703,13}, { 383,12}, \ + { 831,13}, { 447,12}, { 895,14}, { 255,13}, \ + { 511,12}, { 1023,13}, { 575,12}, { 1151,13}, \ + { 639,12}, { 1279,13}, { 703,14}, { 383,13}, \ + { 831,12}, { 1663,13}, { 895,15}, { 255,14}, \ + { 511,13}, { 1087,12}, { 2175,13}, { 1151,14}, \ + { 639,13}, { 1343,12}, { 2687,14}, { 767,13}, \ + { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,14}, { 1279,13}, \ + { 2687,15}, { 767,14}, { 1663,13}, { 3327,16}, \ + { 511,15}, { 1023,14}, { 2175,13}, { 4351,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \ + { 5887,15}, { 1535,14}, { 3455,13}, { 6911,15}, \ + { 1791,14}, { 3839,13}, { 7679,16}, { 1023,15}, \ + { 2047,14}, { 4223,13}, { 8447,14}, { 4479,15}, \ + { 2303,14}, { 4863,15}, { 2559,14}, { 5247,15}, \ + { 2815,14}, { 5887,16}, { 1535,15}, { 3071,14}, \ + { 6271,15}, { 3327,14}, { 6911,15}, { 3839,14}, \ + { 7679,17}, { 1023,16}, { 2047,15}, { 4095,14}, \ + { 8447,15}, { 4351,14}, { 8959,15}, { 4863,16}, \ + { 2559,15}, { 5887,14}, { 11775,16}, { 3071,15}, \ + { 32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 
524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 201 +#define MUL_FFT_THRESHOLD 3200 + +#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 340, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \ + { 23, 7}, { 12, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 63,10}, { 39, 9}, { 79,10}, { 47,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95, 6}, \ + { 1663, 7}, { 895, 9}, { 239, 8}, { 479,10}, \ + { 127, 9}, { 255, 8}, { 511,10}, { 135, 9}, \ + { 271,11}, { 79, 9}, { 319,11}, { 95,10}, \ + { 191, 9}, { 383,10}, { 207,11}, { 111,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543,10}, { 287, 9}, { 575,10}, \ + { 303, 9}, { 607,10}, { 319, 9}, { 639,12}, \ + { 95,11}, { 191,10}, { 383,11}, { 207,10}, \ + { 415,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,10}, { 607,11}, { 319,10}, \ + { 639,11}, { 351,10}, { 703,11}, { 367,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,12}, \ + { 223,11}, { 479,13}, { 127,12}, { 255,11}, \ + { 543,12}, { 287,11}, { 607,12}, { 319,11}, \ + { 639,12}, { 351,11}, { 703,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ + { 479,14}, { 127,13}, { 255,12}, { 607,13}, \ + { 319,12}, { 735,13}, { 383,12}, { 831,13}, \ + { 447,12}, { 895,14}, { 255,13}, { 511,12}, \ + { 1023,13}, { 575,12}, { 1151,13}, { 703,14}, \ + { 383,13}, { 831,12}, { 1663,13}, { 895,15}, \ + { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \ + { 1151,14}, { 639,13}, { 1343,12}, { 2687,13}, \ + { 1407,14}, { 767,13}, { 1599,12}, { 3199,13}, \ + { 1663,14}, { 895,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,14}, { 
1279,13}, \ + { 2687,14}, { 1407,15}, { 767,14}, { 1535,13}, \ + { 3199,14}, { 1663,13}, { 3455,16}, { 511,15}, \ + { 1023,14}, { 2175,13}, { 4479,14}, { 2431,13}, \ + { 4863,15}, { 1279,14}, { 2943,13}, { 5887,15}, \ + { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \ + { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \ + { 4479,15}, { 2303,14}, { 4991,15}, { 2815,14}, \ + { 5887,16}, { 1535,15}, { 3071,14}, { 6143,15}, \ + { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \ + { 1023,16}, { 2047,15}, { 4095,14}, { 8191,15}, \ + { 4351,14}, { 8959,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 208 +#define SQR_FFT_THRESHOLD 2880 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 63 +#define MULLO_MUL_N_THRESHOLD 6253 +#define SQRLO_BASECASE_THRESHOLD 8 +#define SQRLO_DC_THRESHOLD 54 +#define SQRLO_SQR_THRESHOLD 5558 + +#define DC_DIV_QR_THRESHOLD 72 +#define DC_DIVAPPR_Q_THRESHOLD 195 +#define DC_BDIV_QR_THRESHOLD 50 +#define DC_BDIV_Q_THRESHOLD 90 + +#define INV_MULMOD_BNM1_THRESHOLD 46 +#define INV_NEWTON_THRESHOLD 195 +#define INV_APPR_THRESHOLD 197 + +#define BINV_NEWTON_THRESHOLD 230 +#define REDC_1_TO_REDC_2_THRESHOLD 67 +#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */ + +#define MU_DIV_QR_THRESHOLD 1334 +#define MU_DIVAPPR_Q_THRESHOLD 1334 +#define MUPI_DIV_QR_THRESHOLD 104 +#define MU_BDIV_QR_THRESHOLD 1017 +#define MU_BDIV_Q_THRESHOLD 1187 + +#define POWM_SEC_TABLE 1,16,194,712,779,2387 + +#define GET_STR_DC_THRESHOLD 15 +#define GET_STR_PRECOMPUTE_THRESHOLD 29 +#define SET_STR_DC_THRESHOLD 216 +#define SET_STR_PRECOMPUTE_THRESHOLD 994 + +#define FAC_DSC_THRESHOLD 153 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 17 +#define HGCD2_DIV1_METHOD 1 /* 9.38% faster than 3 */ +#define HGCD_THRESHOLD 77 +#define 
HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 2121 +#define GCD_DC_THRESHOLD 440 +#define GCDEXT_DC_THRESHOLD 273 +#define JACOBI_BASE_METHOD 1 /* 7.74% faster than 4 */ + +/* Tuneup completed successfully, took 495910 seconds */ diff --git a/gcc/gmp/mpn/x86_64/core2/com.asm b/gcc/gmp/mpn/x86_64/core2/com.asm new file mode 100644 index 0000000..d7d9f79 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/core2/com.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_com. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_com) +include_mpn(`x86_64/fastsse/com-palignr.asm') diff --git a/gcc/gmp/mpn/x86_64/core2/gcd_11.asm b/gcc/gmp/mpn/x86_64/core2/gcd_11.asm new file mode 100644 index 0000000..b00451f 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/core2/gcd_11.asm @@ -1,0 +1,93 @@ +dnl AMD64 mpn_gcd_11 optimised for Intel CNR, PNR, SBR, IBR. 
+ +dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked for AMD64 by Torbjorn +dnl Granlund. + +dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017, 2019 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit (approx) +C AMD K8,K9 ? +C AMD K10 ? +C AMD bd1 ? +C AMD bd2 ? +C AMD bd3 ? +C AMD bd4 ? +C AMD bt1 ? +C AMD bt2 ? +C AMD zn1 ? +C AMD zn2 ? +C Intel P4 ? +C Intel CNR 4.22 * +C Intel PNR 4.22 * +C Intel NHM 4.97 +C Intel WSM 5.17 +C Intel SBR 4.83 * +C Intel IBR 4.16 * +C Intel HWL 3.84 +C Intel BWL 3.76 +C Intel SKL 3.83 +C Intel atom ? +C Intel SLM ? +C Intel GLM ? +C Intel GLM+ ? +C VIA nano ? 
+ +define(`u0', `%rdi') +define(`v0', `%rsi') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) +C mpn_gcd_11: subtract-and-shift GCD of the limbs u0 and v0. Only u0 is ever shifted, so both inputs are presumably odd on entry -- TODO confirm against callers. +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_11) + FUNC_ENTRY(2) + jmp L(odd) C first pass skips the fixup of a previous difference + + ALIGN(16) +L(top): cmovc %rdx, u0 C u = |u - v| + cmovc %rax, v0 C v = min(u,v) + shr R8(%rcx), u0 C strip the trailing zero bits counted by bsf +L(odd): mov v0, %rdx + sub u0, %rdx C v - u + bsf %rdx, %rcx C rcx = index of lowest set bit of v - u, same as for u - v + mov u0, %rax C save u, it becomes the new v when u < v + sub v0, u0 C u - v + jnz L(top) C loop until u = v + +L(end): C rax = result + C rdx = 0 for the benefit of internal gcd_22 call + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gcc/gmp/mpn/x86_64/core2/gcd_22.asm b/gcc/gmp/mpn/x86_64/core2/gcd_22.asm new file mode 100644 index 0000000..b5aa73b 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/core2/gcd_22.asm @@ -1,0 +1,137 @@ +dnl AMD64 mpn_gcd_22. Assumes useful bsf, useful shrd, no tzcnt, no shlx. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit +C AMD K8,K9 ? +C AMD K10 ? +C AMD bd1 ? +C AMD bd2 ? +C AMD bd3 ? +C AMD bd4 ? 
+C AMD bt1 ? +C AMD bt2 ? +C AMD zn1 ? +C AMD zn2 ? +C Intel P4 ? +C Intel CNR 8.7 +C Intel PNR 8.7 +C Intel NHM 9.2 +C Intel WSM 9.2 +C Intel SBR 9.1 +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom ? +C Intel SLM ? +C Intel GLM ? +C Intel GLM+ ? +C VIA nano ? + + +define(`u1', `%rdi') +define(`u0', `%rsi') +define(`v1', `%rdx') +define(`v0_param', `%rcx') + +define(`v0', `%rax') +define(`cnt', `%rcx') + +define(`s0', `%r8') +define(`s1', `%r9') +define(`t0', `%r10') +define(`t1', `%r11') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) +C mpn_gcd_22: GCD loop on the two-limb values {u1,u0} and {v1,v0}. Once both high limbs are zero the low limbs are passed on to mpn_gcd_11. +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_22) + FUNC_ENTRY(4) + mov v0_param, v0 C move v0 out of rcx, which doubles as cnt + + ALIGN(16) +L(top): mov v0, t0 + sub u0, t0 C low limb of v - u + jz L(lowz) C jump when low limb result = 0 + mov v1, t1 + sbb u1, t1 C high limb of v - u + + mov u0, s0 C s = copy of u, it becomes v when u < v + mov u1, s1 + + bsf t0, cnt C cnt = index of lowest set bit in the low limb of v - u + + sub v0, u0 + sbb v1, u1 C u - v, carry set when u < v + +L(bck): cmovc t0, u0 C u = |u - v| + cmovc t1, u1 C u = |u - v| + cmovc s0, v0 C v = min(u,v) + cmovc s1, v1 C v = min(u,v) + + shrd R8(cnt), u1, u0 C shift the two-limb u right by cnt bits + shr R8(cnt), u1 + + mov v1, t1 + or u1, t1 + jnz L(top) C loop while either high limb is nonzero + +L(gcd_11): + mov v0, %rdi C first argument = v0; u0 already sits in rsi +C mov u0, %rsi + TCALL( mpn_gcd_11) + +L(lowz):C We come here when v0 - u0 = 0 + C 1. If v1 - u1 = 0, then gcd is u = v. + C 2. Else compute gcd_21({v1,v0}, |u1-v1|) + mov v1, t0 + sub u1, t0 C t0 = v1 - u1 + je L(end) + + xor t1, t1 C high limb of the difference is zero on this path + mov u0, s0 + mov u1, s1 + bsf t0, cnt + mov u1, u0 + xor u1, u1 + sub v1, u0 C registers u1:u0 = 0 : u1 - v1, carry set when u1 < v1 + jmp L(bck) + +L(end): C mov v0, %rax + C mov v1, %rdx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gcc/gmp/mpn/x86_64/core2/hamdist.asm b/gcc/gmp/mpn/x86_64/core2/hamdist.asm new file mode 100644 index 0000000..a78753d 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/core2/hamdist.asm @@ -1,0 +1,210 @@ +dnl AMD64 SSSE3 mpn_hamdist -- hamming distance. + +dnl Copyright 2010-2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C cycles/limb good for cpu? +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bd1 ? +C AMD bd2 ? +C AMD bd3 ? +C AMD bd4 ? +C AMD zen ? +C AMD bobcat ? +C AMD jaguar ? +C Intel P4 n/a +C Intel CNR 4.50 y +C Intel PNR 3.28 y +C Intel NHM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom ? +C Intel SLM ? +C VIA nano ? + +C TODO +C * This was hand-written without too much thought about optimal insn +C selection; check to see if it can be improved. +C * Consider doing some instruction scheduling. 
+ +define(`up', `%rdi') +define(`vp', `%rsi') +define(`n', `%rdx') +C mpn_hamdist: count the set bits of up[i] xor vp[i] over n limbs, using pshufb lookups in a 16-entry nibble bit-count table. +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_hamdist) + lea L(cnsts)(%rip), %r9 C r9 = jump table, followed by the lookup table and the mask + +ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48)', + `define(`OFF1',64) define(`OFF2',80)') + movdqa OFF1`'(%r9), %xmm7 C xmm7 = bit count of each nibble value 0..15 + movdqa OFF2`'(%r9), %xmm6 C xmm6 = 0x0f in every byte + pxor %xmm4, %xmm4 C xmm4 = per-byte partial counts + pxor %xmm5, %xmm5 C xmm5 = zero operand for psadbw + pxor %xmm8, %xmm8 C xmm8 = 2 x 64-bit running totals + + mov R32(n), R32(%rax) + and $7, R32(%rax) C dispatch on n mod 8 +ifdef(`PIC',` + movslq (%r9,%rax,4), %rax + add %r9, %rax + jmp *%rax +',` + jmp *(%r9,%rax,8) +') +C Entry stubs: odd residues load one limb with movq; every stub biases up and vp so that the loop loads after its entry point read the next unread limbs. +L(1): movq (up), %xmm1 + add $8, up + movq (vp), %xmm10 + add $8, vp + pxor %xmm10, %xmm1 + jmp L(e1) + +L(2): add $-48, up C 48(up) in the loop is then limb 0 + add $-48, vp + jmp L(e2) + +L(3): movq (up), %xmm1 + add $-40, up + movq (vp), %xmm10 + add $-40, vp + pxor %xmm10, %xmm1 + jmp L(e3) + +L(4): add $-32, up + add $-32, vp + jmp L(e4) + +L(5): movq (up), %xmm1 + add $-24, up + movq (vp), %xmm10 + add $-24, vp + pxor %xmm10, %xmm1 + jmp L(e5) + +L(6): add $-16, up + add $-16, vp + jmp L(e6) + +L(7): movq (up), %xmm1 + add $-8, up + movq (vp), %xmm10 + add $-8, vp + pxor %xmm10, %xmm1 + jmp L(e7) + + ALIGN(32) +L(top): lddqu (up), %xmm1 + lddqu (vp), %xmm10 + pxor %xmm10, %xmm1 C bits that differ between up and vp +L(e7): movdqa %xmm6, %xmm0 C copy mask register + movdqa %xmm7, %xmm2 C copy count register + movdqa %xmm7, %xmm3 C copy count register + pand %xmm1, %xmm0 C low nibble of each byte + psrlw $4, %xmm1 + pand %xmm6, %xmm1 C high nibble of each byte + pshufb %xmm0, %xmm2 C table lookup: bit counts of the low nibbles + pshufb %xmm1, %xmm3 C table lookup: bit counts of the high nibbles + paddb %xmm2, %xmm3 C bit count per byte + paddb %xmm3, %xmm4 C add to the per-byte partial counts +L(e6): lddqu 16(up), %xmm1 + lddqu 16(vp), %xmm10 + pxor %xmm10, %xmm1 +L(e5): movdqa %xmm6, %xmm0 + movdqa %xmm7, %xmm2 + movdqa %xmm7, %xmm3 + pand %xmm1, %xmm0 + psrlw $4, %xmm1 + pand %xmm6, %xmm1 + pshufb %xmm0, %xmm2 + pshufb %xmm1, %xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(e4): lddqu 32(up), %xmm1 + lddqu 32(vp), %xmm10 + pxor %xmm10, %xmm1 +L(e3): movdqa %xmm6, %xmm0 + movdqa %xmm7, %xmm2 + movdqa %xmm7, %xmm3 + pand %xmm1, %xmm0 + psrlw $4, %xmm1 + pand %xmm6, %xmm1 + pshufb %xmm0, %xmm2 + pshufb %xmm1, %xmm3 + paddb %xmm2, 
%xmm3 + paddb %xmm3, %xmm4 +L(e2): lddqu 48(up), %xmm1 + add $64, up + lddqu 48(vp), %xmm10 + add $64, vp C both pointers advance 64 bytes per pass + pxor %xmm10, %xmm1 +L(e1): movdqa %xmm6, %xmm0 + movdqa %xmm7, %xmm2 + movdqa %xmm7, %xmm3 + pand %xmm1, %xmm0 + psrlw $4, %xmm1 + pand %xmm6, %xmm1 + pshufb %xmm0, %xmm2 + pshufb %xmm1, %xmm3 + psadbw %xmm5, %xmm4 C sum to 8 x 16-bit counts + paddb %xmm2, %xmm3 + paddq %xmm4, %xmm8 C sum to 2 x 64-bit counts + movdqa %xmm3, %xmm4 C restart the byte counts from this last block + sub $8, n + jg L(top) C more limbs remain + + psadbw %xmm5, %xmm4 C fold the final byte counts + paddq %xmm4, %xmm8 + pshufd $14, %xmm8, %xmm0 C bring the upper 64-bit total down to the low half + paddq %xmm8, %xmm0 + movq %xmm0, %rax C return the combined total + ret +EPILOGUE() +DEF_OBJECT(L(cnsts),16,`JUMPTABSECT') + JMPENT( L(top), L(cnsts)) + JMPENT( L(1), L(cnsts)) + JMPENT( L(2), L(cnsts)) + JMPENT( L(3), L(cnsts)) + JMPENT( L(4), L(cnsts)) + JMPENT( L(5), L(cnsts)) + JMPENT( L(6), L(cnsts)) + JMPENT( L(7), L(cnsts)) + .byte 0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03 C bit counts of nibble values 0 to 7 + .byte 0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04 C bit counts of nibble values 8 to 15 + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f C nibble mask + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f +END_OBJECT(L(cnsts)) diff --git a/gcc/gmp/mpn/x86_64/core2/logops_n.asm b/gcc/gmp/mpn/x86_64/core2/logops_n.asm new file mode 100644 index 0000000..5ff174c 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/core2/logops_n.asm @@ -1,0 +1,285 @@ +dnl AMD64 logops. + +dnl Copyright 2004-2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C c/l c/l c/l good +C var-1 var-2 var-3 for cpu? +C AMD K8,K9 +C AMD K10 1.52 1.75 1.75 n +C AMD bd1 +C AMD bd2 +C AMD bd3 +C AMD bd4 +C AMD bt1 2.67 ~2.79 ~2.79 = +C AMD bt2 2.15 2.65 2.65 n +C AMD zen 1.5 1.5 1.5 = +C Intel P4 +C Intel PNR 2.0 2.0 2.0 = +C Intel NHM 2.0 2.0 2.0 = +C Intel SBR 1.5 1.5 1.5 y +C Intel IBR 1.47 1.48 1.48 y +C Intel HWL 1.11 1.35 1.35 y +C Intel BWL 1.09 1.30 1.30 y +C Intel SKL 1.21 1.27 1.27 y +C Intel atom 3.31 3.57 3.57 y +C Intel SLM 3.0 3.0 3.0 = +C VIA nano + +ifdef(`OPERATION_and_n',` + define(`func',`mpn_and_n') + define(`VARIANT_1') + define(`LOGOP',`and')') +ifdef(`OPERATION_andn_n',` + define(`func',`mpn_andn_n') + define(`VARIANT_2') + define(`LOGOP',`and')') +ifdef(`OPERATION_nand_n',` + define(`func',`mpn_nand_n') + define(`VARIANT_3') + define(`LOGOP',`and')') +ifdef(`OPERATION_ior_n',` + define(`func',`mpn_ior_n') + define(`VARIANT_1') + define(`LOGOP',`or')') +ifdef(`OPERATION_iorn_n',` + define(`func',`mpn_iorn_n') + define(`VARIANT_2') + define(`LOGOP',`or')') +ifdef(`OPERATION_nior_n',` + define(`func',`mpn_nior_n') + define(`VARIANT_3') + define(`LOGOP',`or')') +ifdef(`OPERATION_xor_n',` + define(`func',`mpn_xor_n') + define(`VARIANT_1') + define(`LOGOP',`xor')') +ifdef(`OPERATION_xnor_n',` + define(`func',`mpn_xnor_n') + define(`VARIANT_2') + define(`LOGOP',`xor')') + +define(`addptr', `lea $1($2), $2') + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + +C INPUT 
PARAMETERS +define(`rp',`%rdi') +define(`up',`%rsi') +define(`vp',`%rdx') +define(`n',`%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() +C VARIANT_1 stores rp[i] = vp[i] LOGOP up[i] for n limbs, four limbs per loop pass. +ifdef(`VARIANT_1',` + TEXT + ALIGN(32) +PROLOGUE(func) + FUNC_ENTRY(4) + mov (vp), %r8 C preload limb 0 of vp + mov R32(%rcx), R32(%rax) + and $3, R32(%rax) C dispatch on n mod 4 + je L(b00) + cmp $2, R32(%rax) + jc L(b01) + je L(b10) + +L(b11): LOGOP (up), %r8 C n mod 4 = 3: limb 0 here then limbs 1 and 2 in the loop tail + mov %r8, (rp) + inc n + addptr( -8, up) + addptr( -8, vp) + addptr( -8, rp) + jmp L(e11) +L(b10): add $2, n C n mod 4 = 2: both limbs go through the loop tail + addptr( -16, up) + addptr( -16, vp) + addptr( -16, rp) + jmp L(e10) +L(b01): LOGOP (up), %r8 C n mod 4 = 1: limb 0 here + mov %r8, (rp) + dec n + jz L(ret) C n was 1 + addptr( 8, up) + addptr( 8, vp) + addptr( 8, rp) + + ALIGN(16) +L(top): mov (vp), %r8 +L(b00): mov 8(vp), %r9 + LOGOP (up), %r8 + LOGOP 8(up), %r9 + mov %r8, (rp) + mov %r9, 8(rp) +L(e11): mov 16(vp), %r8 +L(e10): mov 24(vp), %r9 + addptr( 32, vp) + LOGOP 16(up), %r8 + LOGOP 24(up), %r9 + addptr( 32, up) + mov %r8, 16(rp) + mov %r9, 24(rp) + addptr( 32, rp) + sub $4, n C four limbs per full pass + jnz L(top) + +L(ret): FUNC_EXIT() + ret +EPILOGUE() +') +C VARIANT_2 has the same structure but complements each vp limb before LOGOP is applied. +ifdef(`VARIANT_2',` + TEXT + ALIGN(32) +PROLOGUE(func) + FUNC_ENTRY(4) + mov (vp), %r8 + not %r8 C complement the vp limb first + mov R32(%rcx), R32(%rax) + and $3, R32(%rax) C dispatch on n mod 4 + je L(b00) + cmp $2, R32(%rax) + jc L(b01) + je L(b10) + +L(b11): LOGOP (up), %r8 + mov %r8, (rp) + inc n + addptr( -8, up) + addptr( -8, vp) + addptr( -8, rp) + jmp L(e11) +L(b10): add $2, n + addptr( -16, up) + addptr( -16, vp) + addptr( -16, rp) + jmp L(e10) +L(b01): LOGOP (up), %r8 + mov %r8, (rp) + dec n + jz L(ret) + addptr( 8, up) + addptr( 8, vp) + addptr( 8, rp) + + ALIGN(16) +L(top): mov (vp), %r8 + not %r8 +L(b00): mov 8(vp), %r9 + not %r9 + LOGOP (up), %r8 + LOGOP 8(up), %r9 + mov %r8, (rp) + mov %r9, 8(rp) +L(e11): mov 16(vp), %r8 + not %r8 +L(e10): mov 24(vp), %r9 + not %r9 + addptr( 32, vp) + LOGOP 16(up), %r8 + LOGOP 24(up), %r9 + addptr( 32, up) + mov %r8, 16(rp) + mov %r9, 24(rp) + addptr( 32, rp) + sub $4, n + jnz L(top) + +L(ret): FUNC_EXIT() + ret +EPILOGUE() +') + 
+ifdef(`VARIANT_3',` + TEXT + ALIGN(32) +PROLOGUE(func) + FUNC_ENTRY(4) + mov (vp), %r8 C preload limb 0 of vp + mov R32(%rcx), R32(%rax) + and $3, R32(%rax) C dispatch on n mod 4 + je L(b00) + cmp $2, R32(%rax) + jc L(b01) + je L(b10) + +L(b11): LOGOP (up), %r8 + not %r8 C this variant complements the LOGOP result before storing + mov %r8, (rp) + inc n + addptr( -8, up) + addptr( -8, vp) + addptr( -8, rp) + jmp L(e11) +L(b10): add $2, n + addptr( -16, up) + addptr( -16, vp) + addptr( -16, rp) + jmp L(e10) +L(b01): LOGOP (up), %r8 + not %r8 + mov %r8, (rp) + dec n + jz L(ret) C n was 1 + addptr( 8, up) + addptr( 8, vp) + addptr( 8, rp) + + ALIGN(16) +L(top): mov (vp), %r8 +L(b00): mov 8(vp), %r9 + LOGOP (up), %r8 + not %r8 + LOGOP 8(up), %r9 + not %r9 + mov %r8, (rp) + mov %r9, 8(rp) +L(e11): mov 16(vp), %r8 +L(e10): mov 24(vp), %r9 + addptr( 32, vp) + LOGOP 16(up), %r8 + not %r8 + LOGOP 24(up), %r9 + addptr( 32, up) + not %r9 + mov %r8, 16(rp) + mov %r9, 24(rp) + addptr( 32, rp) + sub $4, n C four limbs per full pass + jnz L(top) + +L(ret): FUNC_EXIT() + ret +EPILOGUE() +') diff --git a/gcc/gmp/mpn/x86_64/coreibwl/mullo_basecase.asm b/gcc/gmp/mpn/x86_64/coreibwl/mullo_basecase.asm new file mode 100644 index 0000000..5cdb209 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/coreibwl/mullo_basecase.asm @@ -1,0 +1,395 @@ +dnl X86-64 mpn_mullo_basecase optimised for Intel Broadwell. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp_param', `%rdx') +define(`n', `%rcx') + +define(`vp', `%r11') +define(`jmpreg',`%rbx') +define(`nn', `%rbp') + +C TODO +C * Suppress more rp[] rewrites in corner. +C * Rearrange feed-in jumps for short branch forms. +C * Perhaps roll out the heavy artillery and 8-way unroll outer loop. Since +C feed-in code implodes, the blow-up will not be more than perhaps 4x. +C * Micro-optimise critical lead-in code block around L(ent). +C * Write n < 4 code specifically for Broadwell (current code is for Haswell). 
+ +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mullo_basecase) + FUNC_ENTRY(4) + cmp $4, R32(n) + jae L(big) C n of 4 or more takes the general path; n = 1, 2, 3 are handled inline + + mov vp_param, vp + mov (up), %rdx + + cmp $2, R32(n) + jae L(gt1) +L(n1): imul (vp), %rdx C n = 1: low limb of u0 x v0 + mov %rdx, (rp) + FUNC_EXIT() + ret +L(gt1): ja L(gt2) +L(n2): mov (vp), %r9 C n = 2 + mulx( %r9, %rax, %rdx) C u0 x v0 + mov %rax, (rp) + mov 8(up), %rax + imul %r9, %rax C u1 x v0 + add %rax, %rdx + mov 8(vp), %r9 + mov (up), %rcx + imul %r9, %rcx C u0 x v1 + add %rcx, %rdx + mov %rdx, 8(rp) + FUNC_EXIT() + ret +L(gt2): +L(n3): mov (vp), %r9 C n = 3 + mulx( %r9, %rax, %r10) C u0 x v0 + mov %rax, (rp) + mov 8(up), %rdx + mulx( %r9, %rax, %rdx) C u1 x v0 + imul 16(up), %r9 C u2 x v0 + add %rax, %r10 + adc %rdx, %r9 + mov 8(vp), %r8 + mov (up), %rdx + mulx( %r8, %rax, %rdx) C u0 x v1 + add %rax, %r10 + adc %rdx, %r9 + imul 8(up), %r8 C u1 x v1 + add %r8, %r9 + mov %r10, 8(rp) + mov 16(vp), %r10 + mov (up), %rax + imul %rax, %r10 C u0 x v2 + add %r10, %r9 + mov %r9, 16(rp) + FUNC_EXIT() + ret + + ALIGN(16) +L(big): push %r14 + push %r12 + push %rbx + push %rbp + mov -8(vp_param,n,8), %r14 C FIXME Put at absolute end + imul (up), %r14 C FIXME Put at absolute end + lea -3(n), R32(nn) C nn = n - 3 + lea 8(vp_param), vp + mov (vp_param), %rdx C rdx = v0 is the implicit mulx multiplier + + mov R32(n), R32(%rax) + shr $3, R32(n) C n = n / 8; the and below leaves n mod 8 in rax as the L(mtab) index + and $7, R32(%rax) C clear OF, CF as side-effect + lea L(mtab)(%rip), %r10 +ifdef(`PIC', +` movslq (%r10,%rax,4), %rax + lea (%rax, %r10), %r10 + jmp *%r10 +',` + jmp *(%r10,%rax,8) +') + +L(mf0): mulx( (up), %r10, %r8) + lea 56(up), up + lea -8(rp), rp + lea L(f7)(%rip), jmpreg + jmp L(mb0) + +L(mf3): mulx( (up), %r9, %rax) + lea 16(up), up + lea 16(rp), rp + jrcxz L(mc) + inc R32(n) + lea L(f2)(%rip), jmpreg + jmp L(mb3) + +L(mc): mulx( -8,(up), %r10, %r8) + add %rax, %r10 + mov %r9, -16(rp) + mulx( (up), %r9, %rax) + mov %r10, -8(rp) + adc %r8, %r9 + mov %r9, (rp) + jmp L(c2) + +L(mf4): mulx( (up), %r10, %r8) + lea 24(up), up + lea 24(rp), rp + inc R32(n) + lea L(f3)(%rip), jmpreg + jmp L(mb4) +
+L(mf5): mulx( (up), %r9, %rax) + lea 32(up), up + lea 32(rp), rp + inc R32(n) + lea L(f4)(%rip), jmpreg + jmp L(mb5) + +L(mf6): mulx( (up), %r10, %r8) + lea 40(up), up + lea 40(rp), rp + inc R32(n) + lea L(f5)(%rip), jmpreg + jmp L(mb6) + +L(mf7): mulx( (up), %r9, %rax) + lea 48(up), up + lea 48(rp), rp + lea L(f6)(%rip), jmpreg + jmp L(mb7) + +L(mf1): mulx( (up), %r9, %rax) + lea L(f0)(%rip), jmpreg + jmp L(mb1) + +L(mf2): mulx( (up), %r10, %r8) + lea 8(up), up + lea 8(rp), rp + lea L(f1)(%rip), jmpreg + mulx( (up), %r9, %rax) + +C FIXME ugly fallthrough FIXME + ALIGN(32) +L(mtop):mov %r10, -8(rp) C first pass: products of up limbs with v0 are stored, not accumulated + adc %r8, %r9 +L(mb1): mulx( 8,(up), %r10, %r8) + adc %rax, %r10 + lea 64(up), up + mov %r9, (rp) +L(mb0): mov %r10, 8(rp) + mulx( -48,(up), %r9, %rax) + lea 64(rp), rp + adc %r8, %r9 +L(mb7): mulx( -40,(up), %r10, %r8) + mov %r9, -48(rp) + adc %rax, %r10 +L(mb6): mov %r10, -40(rp) + mulx( -32,(up), %r9, %rax) + adc %r8, %r9 +L(mb5): mulx( -24,(up), %r10, %r8) + mov %r9, -32(rp) + adc %rax, %r10 +L(mb4): mulx( -16,(up), %r9, %rax) + mov %r10, -24(rp) + adc %r8, %r9 +L(mb3): mulx( -8,(up), %r10, %r8) + adc %rax, %r10 + mov %r9, -16(rp) + dec R32(n) + mulx( (up), %r9, %rax) + jnz L(mtop) + +L(mend):mov %r10, -8(rp) + adc %r8, %r9 + mov %r9, (rp) + adc %rcx, %rax C rcx = 0 after the loop; fold CF into rax + + lea 8(,nn,8), %r12 + neg %r12 C r12 = -8 * (nn + 1) + shr $3, R32(nn) + jmp L(ent) + +L(f0): mulx( (up), %r10, %r8) + lea -8(up), up + lea -8(rp), rp + lea L(f7)(%rip), jmpreg + jmp L(b0) + +L(f1): mulx( (up), %r9, %rax) + lea -1(nn), R32(nn) + lea L(f0)(%rip), jmpreg + jmp L(b1) + +L(end): adox( (rp), %r9) + mov %r9, (rp) + adox( %rcx, %rax) C relies on rcx = 0 + adc %rcx, %rax C FIXME suppress, use adc below; reqs ent path edits + lea 8(%r12), %r12 +L(ent): mulx( 8,(up), %r10, %r8) C r8 unused (use imul?)
+ add %rax, %r14 + add %r10, %r14 C h + lea (up,%r12), up C reset up + lea 8(rp,%r12), rp C reset rp + mov (vp), %rdx C next v limb becomes the mulx multiplier + lea 8(vp), vp + or R32(nn), R32(n) C copy count, clear CF,OF (n = 0 prior) + jmp *jmpreg C feed-in label was chosen by the previous pass + +L(f7): mulx( (up), %r9, %rax) + lea -16(up), up + lea -16(rp), rp + lea L(f6)(%rip), jmpreg + jmp L(b7) + +L(f2): mulx( (up), %r10, %r8) + lea 8(up), up + lea 8(rp), rp + mulx( (up), %r9, %rax) + lea L(f1)(%rip), jmpreg + +C FIXME ugly fallthrough FIXME + ALIGN(32) +L(top): adox( -8,(rp), %r10) C addmul pass: adox adds the old rp limbs while adcx links the products + adcx( %r8, %r9) + mov %r10, -8(rp) + jrcxz L(end) +L(b1): mulx( 8,(up), %r10, %r8) + adox( (rp), %r9) + lea -1(n), R32(n) C count down without touching CF or OF + mov %r9, (rp) + adcx( %rax, %r10) +L(b0): mulx( 16,(up), %r9, %rax) + adcx( %r8, %r9) + adox( 8,(rp), %r10) + mov %r10, 8(rp) +L(b7): mulx( 24,(up), %r10, %r8) + lea 64(up), up + adcx( %rax, %r10) + adox( 16,(rp), %r9) + mov %r9, 16(rp) +L(b6): mulx( -32,(up), %r9, %rax) + adox( 24,(rp), %r10) + adcx( %r8, %r9) + mov %r10, 24(rp) +L(b5): mulx( -24,(up), %r10, %r8) + adcx( %rax, %r10) + adox( 32,(rp), %r9) + mov %r9, 32(rp) +L(b4): mulx( -16,(up), %r9, %rax) + adox( 40,(rp), %r10) + adcx( %r8, %r9) + mov %r10, 40(rp) +L(b3): adox( 48,(rp), %r9) + mulx( -8,(up), %r10, %r8) + mov %r9, 48(rp) + lea 64(rp), rp + adcx( %rax, %r10) + mulx( (up), %r9, %rax) + jmp L(top) + +L(f6): mulx( (up), %r10, %r8) + lea 40(up), up + lea -24(rp), rp + lea L(f5)(%rip), jmpreg + jmp L(b6) + +L(f5): mulx( (up), %r9, %rax) + lea 32(up), up + lea -32(rp), rp + lea L(f4)(%rip), jmpreg + jmp L(b5) + +L(f4): mulx( (up), %r10, %r8) + lea 24(up), up + lea -40(rp), rp + lea L(f3)(%rip), jmpreg + jmp L(b4) + +L(f3): mulx( (up), %r9, %rax) + lea 16(up), up + lea -48(rp), rp + jrcxz L(cor) + lea L(f2)(%rip), jmpreg + jmp L(b3) + +L(cor): adox( 48,(rp), %r9) + mulx( -8,(up), %r10, %r8) + mov %r9, 48(rp) + lea 64(rp), rp + adcx( %rax, %r10) + mulx( (up), %r9, %rax) + adox( -8,(rp), %r10) + adcx( %r8, %r9) + mov %r10, -8(rp) C FIXME suppress + adox( (rp), %r9) + mov %r9, (rp) C
FIXME suppress + adox( %rcx, %rax) +L(c2): + mulx( 8,(up), %r10, %r8) + adc %rax, %r14 + add %r10, %r14 + mov (vp), %rdx + test R32(%rcx), R32(%rcx) C clears CF and OF for the chains below + mulx( -16,(up), %r10, %r8) + mulx( -8,(up), %r9, %rax) + adox( -8,(rp), %r10) + adcx( %r8, %r9) + mov %r10, -8(rp) + adox( (rp), %r9) + adox( %rcx, %rax) + adc %rcx, %rax + mulx( (up), %r10, %r8) + add %rax, %r14 + add %r10, %r14 + mov 8(vp), %rdx + mulx( -16,(up), %rcx, %rax) + add %r9, %rcx + mov %rcx, (rp) + adc $0, %rax + mulx( -8,(up), %r10, %r8) + add %rax, %r14 + add %r10, %r14 + mov %r14, 8(rp) C store the limb accumulated in r14 + pop %rbp + pop %rbx + pop %r12 + pop %r14 + FUNC_EXIT() + ret +EPILOGUE() + JUMPTABSECT + ALIGN(8) +L(mtab):JMPENT( L(mf7), L(mtab)) C table indexed by n mod 8 + JMPENT( L(mf0), L(mtab)) + JMPENT( L(mf1), L(mtab)) + JMPENT( L(mf2), L(mtab)) + JMPENT( L(mf3), L(mtab)) + JMPENT( L(mf4), L(mtab)) + JMPENT( L(mf5), L(mtab)) + JMPENT( L(mf6), L(mtab)) diff --git a/gcc/gmp/mpn/x86_64/coreihwl/aorrlsh_n.asm b/gcc/gmp/mpn/x86_64/coreihwl/aorrlsh_n.asm new file mode 100644 index 0000000..ff0d27b 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/coreihwl/aorrlsh_n.asm @@ -1,0 +1,38 @@ +dnl X86-64 mpn_addlsh_n and mpn_rsblsh_n. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n) +include_mpn(`x86_64/zen/aorrlsh_n.asm') diff --git a/gcc/gmp/mpn/x86_64/coreihwl/aors_n.asm b/gcc/gmp/mpn/x86_64/coreihwl/aors_n.asm new file mode 100644 index 0000000..fc99627 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/coreihwl/aors_n.asm @@ -1,0 +1,261 @@ +dnl AMD64 mpn_add_n, mpn_sub_n + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bd1 1.5 with fluctuations +C AMD bd2 1.5 with fluctuations +C AMD bd3 +C AMD bd4 1.6 +C AMD zen +C AMD bt1 +C AMD bt2 +C Intel P4 +C Intel PNR +C Intel NHM +C Intel SBR +C Intel IBR +C Intel HWL 1.21 +C Intel BWL 1.04 +C Intel SKL +C Intel atom +C Intel SLM +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimization tool suite written by David Harvey and Torbjorn Granlund. + +C INPUT PARAMETERS +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`vp', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`cy', `%r8') C rsp+40 (mpn_add_nc and mpn_sub_nc) + +ifdef(`OPERATION_add_n', ` + define(ADCSBB, adc) + define(func, mpn_add_n) + define(func_nc, mpn_add_nc)') +ifdef(`OPERATION_sub_n', ` + define(ADCSBB, sbb) + define(func, mpn_sub_n) + define(func_nc, mpn_sub_nc)') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + + mov R32(n), R32(%rax) + shr $3, n C n = n / 8 + and $7, R32(%rax) C rax = n mod 8 picks the L(tab) entry + + lea L(tab)(%rip), %r9 + neg %r8 C set carry +ifdef(`PIC',` + movslq (%r9,%rax,4), %rax + lea (%r9,%rax), %rax C lea not add to preserve carry + jmp *%rax +',` + jmp *(%r9,%rax,8) +') +EPILOGUE() + + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) + + mov R32(n), R32(%rax) + shr $3, n + and $7, R32(%rax) C clear cy as side-effect + + lea L(tab)(%rip), %r9 +ifdef(`PIC',` + movslq (%r9,%rax,4), %rax + lea (%r9,%rax), %rax C lea not add to preserve carry + jmp *%rax +',` + jmp *(%r9,%rax,8) +') + +L(0): mov (up), %r8 C n mod 8 = 0 + mov 8(up), %r9 + ADCSBB (vp), %r8 + jmp L(e0) + +L(4): mov (up), %r8 + mov 8(up), %r9 + ADCSBB (vp), %r8 + lea -32(up), up + lea -32(vp), vp + lea -32(rp), rp + inc n C the partial pass entered at L(e4) ends in dec n; inc does not change CF + jmp L(e4) + +L(5): mov (up), %r11 + mov 8(up), %r8 + mov 16(up), %r9 + ADCSBB (vp), %r11 + lea -24(up), up + lea -24(vp), vp + lea -24(rp), rp + inc
n + jmp L(e5) + +L(6): mov (up), %r10 + ADCSBB (vp), %r10 + mov 8(up), %r11 + lea -16(up), up + lea -16(vp), vp + lea -16(rp), rp + inc n + jmp L(e6) + +L(7): mov (up), %r9 + mov 8(up), %r10 + ADCSBB (vp), %r9 + ADCSBB 8(vp), %r10 + lea -8(up), up + lea -8(vp), vp + lea -8(rp), rp + inc n + jmp L(e7) + + ALIGN(16) +L(top): +L(e3): mov %r9, 40(rp) +L(e2): mov %r10, 48(rp) +L(e1): mov (up), %r8 + mov 8(up), %r9 + ADCSBB (vp), %r8 + mov %r11, 56(rp) + lea 64(rp), rp +L(e0): mov 16(up), %r10 + ADCSBB 8(vp), %r9 + ADCSBB 16(vp), %r10 + mov %r8, (rp) +L(e7): mov 24(up), %r11 + mov %r9, 8(rp) +L(e6): mov 32(up), %r8 + mov 40(up), %r9 + ADCSBB 24(vp), %r11 + mov %r10, 16(rp) +L(e5): ADCSBB 32(vp), %r8 + mov %r11, 24(rp) +L(e4): mov 48(up), %r10 + mov 56(up), %r11 + mov %r8, 32(rp) + lea 64(up), up + ADCSBB 40(vp), %r9 + ADCSBB 48(vp), %r10 + ADCSBB 56(vp), %r11 + lea 64(vp), vp + dec n C dec keeps CF for the next pass + jnz L(top) + +L(end): mov %r9, 40(rp) + mov %r10, 48(rp) + mov %r11, 56(rp) + mov R32(n), R32(%rax) C n = 0 here + adc R32(n), R32(%rax) C return the carry or borrow out + FUNC_EXIT() + ret + + ALIGN(16) +L(3): mov (up), %r9 + mov 8(up), %r10 + mov 16(up), %r11 + ADCSBB (vp), %r9 + ADCSBB 8(vp), %r10 + ADCSBB 16(vp), %r11 + jrcxz L(x3) C n / 8 = 0: only these three limbs + lea 24(up), up + lea 24(vp), vp + lea -40(rp), rp + jmp L(e3) +L(x3): mov %r9, (rp) + mov %r10, 8(rp) + mov %r11, 16(rp) + mov R32(n), R32(%rax) + adc R32(n), R32(%rax) + FUNC_EXIT() + ret + + ALIGN(16) +L(1): mov (up), %r11 + ADCSBB (vp), %r11 + jrcxz L(x1) + lea 8(up), up + lea 8(vp), vp + lea -56(rp), rp + jmp L(e1) +L(x1): mov %r11, (rp) + mov R32(n), R32(%rax) + adc R32(n), R32(%rax) + FUNC_EXIT() + ret + + ALIGN(16) +L(2): mov (up), %r10 + mov 8(up), %r11 + ADCSBB (vp), %r10 + ADCSBB 8(vp), %r11 + jrcxz L(x2) + lea 16(up), up + lea 16(vp), vp + lea -48(rp), rp + jmp L(e2) +L(x2): mov %r10, (rp) + mov %r11, 8(rp) + mov R32(n), R32(%rax) + adc R32(n), R32(%rax) + FUNC_EXIT() + ret +EPILOGUE() + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(0), L(tab)) + JMPENT( L(1), L(tab)) + JMPENT( L(2), L(tab)) +
JMPENT( L(3), L(tab)) + JMPENT( L(4), L(tab)) + JMPENT( L(5), L(tab)) + JMPENT( L(6), L(tab)) + JMPENT( L(7), L(tab)) diff --git a/gcc/gmp/mpn/x86_64/coreihwl/gcd_22.asm b/gcc/gmp/mpn/x86_64/coreihwl/gcd_22.asm new file mode 100644 index 0000000..b5863b6 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/coreihwl/gcd_22.asm @@ -1,0 +1,138 @@ +dnl AMD64 mpn_gcd_22. Assumes useless bsf, useless shrd, useful tzcnt, shlx. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + + +C cycles/bit +C AMD K8,K9 - +C AMD K10 - +C AMD bd1 - +C AMD bd2 - +C AMD bd3 - +C AMD bd4 6.7 +C AMD bt1 - +C AMD bt2 - +C AMD zn1 5.4 +C AMD zn2 5.5 +C Intel P4 - +C Intel CNR - +C Intel PNR - +C Intel NHM - +C Intel WSM - +C Intel SBR - +C Intel IBR - +C Intel HWL 7.1 +C Intel BWL 5.5 +C Intel SKL 5.6 +C Intel atom - +C Intel SLM - +C Intel GLM - +C Intel GLM+ - +C VIA nano - + + +define(`u1', `%rdi') +define(`u0', `%rsi') +define(`v1', `%rdx') +define(`v0', `%rcx') + +define(`s0', `%r8') +define(`s1', `%r9') +define(`t0', `%r10') +define(`t1', `%r11') +define(`cnt', `%rax') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_22) + FUNC_ENTRY(4) + + ALIGN(16) +L(top): mov v0, t0 + sub u0, t0 + jz L(lowz) C jump when low limb result = 0 + mov v1, t1 + sbb u1, t1 C {t1,t0} = v - u + + rep;bsf t0, cnt C tzcnt! + + mov u0, s0 + sub v0, u0 + mov u1, s1 + sbb v1, u1 C {u1,u0} = u - v with CF set when u is below v + +L(bck): cmovc t0, u0 C u = |u - v| + cmovc t1, u1 C u = |u - v| + cmovc s0, v0 C v = min(u,v) + cmovc s1, v1 C v = min(u,v) + + xor R32(t0), R32(t0) + sub cnt, t0 C t0 = -cnt + shlx( t0, u1, s1) C s1 = u1 shifted left by -cnt mod 64: the bits that cross from u1 into u0 + shrx( cnt, u0, u0) + shrx( cnt, u1, u1) + or s1, u0 C merge them: u is now shifted right by cnt as a two-limb value (holds for cnt nonzero or u1 = 0) + + test v1, v1 C iterate while either high limb is nonzero + jnz L(top) + test u1, u1 + jnz L(top) + +L(gcd_11): + mov v0, %rdi C both high limbs are zero: finish in mpn_gcd_11 with v0 in rdi + +C mov u0, %rsi + TCALL( mpn_gcd_11) + +L(lowz):C We come here when v0 - u0 = 0 + C 1. If v1 - u1 = 0, then gcd is u = v. + C 2. Else compute gcd_21({v1,v0}, |u1-v1|) + mov v1, t0 + sub u1, t0 + je L(end) + + xor t1, t1 C t = {0, v1 - u1} + mov u0, s0 + mov u1, s1 + rep;bsf t0, cnt C tzcnt! + mov u1, u0 + xor u1, u1 + sub v1, u0 + jmp L(bck) + +L(end): mov v0, %rax C u = v: return v with its low limb in rax + C mov v1, %rdx +L(ret): FUNC_EXIT() + ret +EPILOGUE() diff --git a/gcc/gmp/mpn/x86_64/coreisbr/cnd_add_n.asm b/gcc/gmp/mpn/x86_64/coreisbr/cnd_add_n.asm new file mode 100644 index 0000000..43abcc8 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/coreisbr/cnd_add_n.asm @@ -1,0 +1,174 @@ +dnl AMD64 mpn_cnd_add_n.
+ +dnl Copyright 2011-2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bd1 +C AMD bd2 +C AMD bd3 +C AMD bd4 +C AMD zen +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel PNR 3.0 +C Intel NHM 3.75 +C Intel SBR 1.93 +C Intel IBR 1.89 +C Intel HWL 1.78 +C Intel BWL 1.50 +C Intel SKL 1.50 +C Intel atom +C Intel SLM 4.0 +C VIA nano + +C NOTES +C * It might seem natural to use the cmov insn here, but since this function +C is supposed to have the exact same execution pattern for cnd true and +C false, and since cmov's documentation is not clear about whether it +C actually reads both source operands and writes the register for a false +C condition, we cannot use it. 
+ +C INPUT PARAMETERS +define(`cnd_arg', `%rdi') dnl rcx +define(`rp', `%rsi') dnl rdx +define(`up', `%rdx') dnl r8 +define(`vp', `%rcx') dnl r9 +define(`n', `%r8') dnl rsp+40 + +define(`cnd', `%rbx') + +define(ADDSUB, add) +define(ADCSBB, adc) +define(func, mpn_cnd_add_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_cnd_add_n) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), R32(%r8)') + push %rbx C rbx holds the cnd mask and is restored at L(end) + + neg cnd_arg + sbb cnd, cnd C make cnd mask + + test $1, R8(n) C branch on n mod 4: L(b1), L(b2), L(b3) peel 1, 2 or 3 limbs ahead of the 4-limb loop + jz L(x0) +L(x1): test $2, R8(n) + jz L(b1) + +L(b3): mov (vp), %rdi + mov 8(vp), %r9 + mov 16(vp), %r10 + and cnd, %rdi C masked vp limb: unchanged if cnd_arg was nonzero, else 0 + and cnd, %r9 + and cnd, %r10 + ADDSUB (up), %rdi + mov %rdi, (rp) + ADCSBB 8(up), %r9 + mov %r9, 8(rp) + ADCSBB 16(up), %r10 + mov %r10, 16(rp) + sbb R32(%rax), R32(%rax) C save carry + lea 24(up), up + lea 24(vp), vp + lea 24(rp), rp + sub $3, n + jnz L(top) + jmp L(end) + +L(x0): xor R32(%rax), R32(%rax) C no carry in yet + test $2, R8(n) + jz L(top) + +L(b2): mov (vp), %rdi + mov 8(vp), %r9 + and cnd, %rdi + and cnd, %r9 + ADDSUB (up), %rdi + mov %rdi, (rp) + ADCSBB 8(up), %r9 + mov %r9, 8(rp) + sbb R32(%rax), R32(%rax) C save carry + lea 16(up), up + lea 16(vp), vp + lea 16(rp), rp + sub $2, n + jnz L(top) + jmp L(end) + +L(b1): mov (vp), %rdi + and cnd, %rdi + ADDSUB (up), %rdi + mov %rdi, (rp) + sbb R32(%rax), R32(%rax) C save carry + lea 8(up), up + lea 8(vp), vp + lea 8(rp), rp + dec n + jz L(end) + + ALIGN(16) +L(top): mov (vp), %rdi + mov 8(vp), %r9 + mov 16(vp), %r10 + mov 24(vp), %r11 + lea 32(vp), vp + and cnd, %rdi + and cnd, %r9 + and cnd, %r10 + and cnd, %r11 + add R32(%rax), R32(%rax) C restore carry + ADCSBB (up), %rdi + mov %rdi, (rp) + ADCSBB 8(up), %r9 + mov %r9, 8(rp) + ADCSBB 16(up), %r10 + mov %r10, 16(rp) + ADCSBB 24(up), %r11 + lea 32(up), up + mov %r11, 24(rp) + lea 32(rp), rp + sbb R32(%rax), R32(%rax) C save carry + sub $4, n + jnz L(top) + +L(end): neg R32(%rax) C saved carry is 0 or -1; return 0 or 1 + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git
a/gcc/gmp/mpn/x86_64/coreisbr/cnd_sub_n.asm b/gcc/gmp/mpn/x86_64/coreisbr/cnd_sub_n.asm new file mode 100644 index 0000000..f55492b 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/coreisbr/cnd_sub_n.asm @@ -1,0 +1,200 @@ +dnl AMD64 mpn_cnd_add_n, mpn_cnd_sub_n + +dnl Copyright 2011-2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bd1 +C AMD bd2 +C AMD bd3 +C AMD bd4 +C AMD zen +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel PNR 3.0 +C Intel NHM 2.75 +C Intel SBR 2.15 +C Intel IBR 1.96 +C Intel HWL 2.0 +C Intel BWL 1.65 +C Intel SKL 1.65 +C Intel atom +C Intel SLM 4.5 +C VIA nano + +C NOTES +C * It might seem natural to use the cmov insn here, but since this function +C is supposed to have the exact same execution pattern for cnd true and +C false, and since cmov's documentation is not clear about whether it +C actually reads both source operands and writes the register for a false +C condition, we cannot use it. +C * Given that we have a dedicated cnd_add_n, it might look strange that this +C file provides cnd_add_n and not just cnd_sub_n. But that's harmless, and +C this file's generality might come in handy for some pipeline. + +C INPUT PARAMETERS +define(`cnd_arg', `%rdi') dnl rcx +define(`rp', `%rsi') dnl rdx +define(`up', `%rdx') dnl r8 +define(`vp', `%rcx') dnl r9 +define(`n', `%r8') dnl rsp+40 + +define(`cnd', `%rbx') + +ifdef(`OPERATION_cnd_add_n',` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func, mpn_cnd_add_n)') +ifdef(`OPERATION_cnd_sub_n',` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func, mpn_cnd_sub_n)') + +MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), R32(%r8)') + push %rbx + push %rbp + push %r12 + push %r13 C rbx holds the cnd mask; rbp, r12 and r13 hold up limbs; all four are restored at L(end) + + neg cnd_arg + sbb cnd, cnd C make cnd mask + + test $1, R8(n) C branch on n mod 4: L(b1), L(b2), L(b3) peel 1, 2 or 3 limbs ahead of the 4-limb loop + jz L(x0) +L(x1): test $2, R8(n) + jz L(b1) + +L(b3): mov (vp), %rdi + mov 8(vp), %r9 + mov 16(vp), %r10 + and cnd, %rdi + mov (up), %r12 + and cnd, %r9 + mov 8(up), %r13 + and cnd, %r10 + mov 16(up), %rbp + ADDSUB %rdi, %r12 C the up limb is the destination, so the sub variant computes up minus masked vp + mov %r12, (rp) + ADCSBB %r9, %r13 + mov %r13, 8(rp) + ADCSBB %r10, %rbp + mov %rbp, 16(rp) + sbb R32(%rax), R32(%rax) C save carry + lea 24(up), up + lea
24(vp), vp + lea 24(rp), rp + sub $3, n + jnz L(top) + jmp L(end) + +L(x0): xor R32(%rax), R32(%rax) C no carry or borrow in yet + test $2, R8(n) + jz L(top) + +L(b2): mov (vp), %rdi + mov 8(vp), %r9 + mov (up), %r12 + and cnd, %rdi + mov 8(up), %r13 + and cnd, %r9 + ADDSUB %rdi, %r12 + mov %r12, (rp) + ADCSBB %r9, %r13 + mov %r13, 8(rp) + sbb R32(%rax), R32(%rax) C save carry + lea 16(up), up + lea 16(vp), vp + lea 16(rp), rp + sub $2, n + jnz L(top) + jmp L(end) + +L(b1): mov (vp), %rdi + mov (up), %r12 + and cnd, %rdi + ADDSUB %rdi, %r12 + mov %r12, (rp) + sbb R32(%rax), R32(%rax) C save carry + lea 8(up), up + lea 8(vp), vp + lea 8(rp), rp + dec n + jz L(end) + + ALIGN(16) +L(top): mov (vp), %rdi + mov 8(vp), %r9 + mov 16(vp), %r10 + mov 24(vp), %r11 + lea 32(vp), vp + and cnd, %rdi + mov (up), %r12 + and cnd, %r9 + mov 8(up), %r13 + and cnd, %r10 + mov 16(up), %rbp + and cnd, %r11 + add R32(%rax), R32(%rax) C restore carry + mov 24(up), %rax C mov keeps CF; rax carries the fourth up limb until the sbb below + lea 32(up), up + ADCSBB %rdi, %r12 + mov %r12, (rp) + ADCSBB %r9, %r13 + mov %r13, 8(rp) + ADCSBB %r10, %rbp + mov %rbp, 16(rp) + ADCSBB %r11, %rax + mov %rax, 24(rp) + lea 32(rp), rp + sbb R32(%rax), R32(%rax) C save carry + sub $4, n + jnz L(top) + +L(end): neg R32(%rax) C saved carry is 0 or -1; return 0 or 1 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gcc/gmp/mpn/x86_64/coreisbr/gcd_11.asm b/gcc/gmp/mpn/x86_64/coreisbr/gcd_11.asm new file mode 100644 index 0000000..4723093 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/coreisbr/gcd_11.asm @@ -1,0 +1,37 @@ +dnl AMD64 mpn_gcd_11. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version.
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_11) +include_mpn(`x86_64/core2/gcd_11.asm') diff --git a/gcc/gmp/mpn/x86_64/fat/addmul_2.c b/gcc/gmp/mpn/x86_64/fat/addmul_2.c new file mode 100644 index 0000000..e0d7358 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/fat/addmul_2.c @@ -1,0 +1,38 @@ +/* Fat binary fallback mpn_addmul_2. + +Copyright 2016 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +mp_limb_t +mpn_addmul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, const mp_limb_t vp[2]) +{ /* Fallback built from two mpn_addmul_1 passes, one per limb of vp. */ + rp[n] = mpn_addmul_1 (rp, up, n, vp[0]); /* Per the GMP manual this adds {up,n} * vp[0] into {rp,n}; its carry-out limb is stored to rp[n], overwriting the old value. */ + return mpn_addmul_1 (rp + 1, up, n, vp[1]); /* Adds {up,n} * vp[1] into {rp+1,n}; the carry-out limb of this pass is the return value. */ +} diff --git a/gcc/gmp/mpn/x86_64/goldmont/aorrlsh_n.asm b/gcc/gmp/mpn/x86_64/goldmont/aorrlsh_n.asm new file mode 100644 index 0000000..06c5d5d 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/goldmont/aorrlsh_n.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_addlsh_n, mpn_rsblsh_n, optimised for Intel Goldmont. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/.
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n) +include_mpn(`x86_64/k8/aorrlsh_n.asm') diff --git a/gcc/gmp/mpn/x86_64/goldmont/aors_n.asm b/gcc/gmp/mpn/x86_64/goldmont/aors_n.asm new file mode 100644 index 0000000..1818f9f 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/goldmont/aors_n.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_add_n, mpn_sub_n, optimised for Intel Goldmont. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) +include_mpn(`x86_64/coreihwl/aors_n.asm') diff --git a/gcc/gmp/mpn/x86_64/goldmont/aorsmul_1.asm b/gcc/gmp/mpn/x86_64/goldmont/aorsmul_1.asm new file mode 100644 index 0000000..9c5f631 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/goldmont/aorsmul_1.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Goldmont. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) +include_mpn(`x86_64/bd1/aorsmul_1.asm') diff --git a/gcc/gmp/mpn/x86_64/goldmont/gmp-mparam.h b/gcc/gmp/mpn/x86_64/goldmont/gmp-mparam.h new file mode 100644 index 0000000..531521d 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/goldmont/gmp-mparam.h @@ -1,0 +1,264 @@ +/* Intel Goldmont gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 2200 MHz Intel Atom C3758 Goldmont/Denverton */ +/* FFT tuning limit = 468,030,122 */ +/* Generated by tuneup.c, 2019-10-12, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 3 +#define MOD_1_UNNORM_THRESHOLD 5 +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 13 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 38 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 3 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 17 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 19 + +#define DIV_1_VS_MUL_1_PERCENT 301 + +#define MUL_TOOM22_THRESHOLD 23 +#define MUL_TOOM33_THRESHOLD 65 +#define MUL_TOOM44_THRESHOLD 178 +#define MUL_TOOM6H_THRESHOLD 258 +#define MUL_TOOM8H_THRESHOLD 357 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 121 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 131 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 121 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 129 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 178 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 30 +#define SQR_TOOM3_THRESHOLD 113 +#define SQR_TOOM4_THRESHOLD 290 +#define SQR_TOOM6_THRESHOLD 351 +#define SQR_TOOM8_THRESHOLD 0 /* always */ + +#define MULMID_TOOM42_THRESHOLD 36 + +#define MULMOD_BNM1_THRESHOLD 14 +#define SQRMOD_BNM1_THRESHOLD 16 + +#define MUL_FFT_MODF_THRESHOLD 440 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 440, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 21, 7}, { 11, 6}, \ + { 24, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ + { 31, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 
51,11}, { 15,10}, { 31, 9}, \ + { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 103,12}, { 31,11}, { 63,10}, \ + { 135,11}, { 79,10}, { 159,11}, { 95,10}, \ + { 191,11}, { 111,12}, { 63,11}, { 127,10}, \ + { 255,11}, { 143,10}, { 287, 9}, { 575,11}, \ + { 159,10}, { 319,12}, { 95,11}, { 191,10}, \ + { 383, 9}, { 767,11}, { 207,10}, { 415,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,10}, { 575,11}, \ + { 303,12}, { 159,11}, { 319,10}, { 639,11}, \ + { 367,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,12}, { 223,11}, { 479,13}, { 127,12}, \ + { 255,11}, { 543,12}, { 287,11}, { 607,12}, \ + { 319,11}, { 639,12}, { 351,11}, { 703,13}, \ + { 191,12}, { 383,11}, { 767,12}, { 415,11}, \ + { 831,12}, { 479,14}, { 127,13}, { 255,12}, \ + { 543,11}, { 1087,12}, { 607,13}, { 319,12}, \ + { 671,11}, { 1343,12}, { 703,11}, { 1407,12}, \ + { 735,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 831,13}, { 447,12}, { 959,14}, { 255,13}, \ + { 511,12}, { 1023,11}, { 2047,12}, { 1087,13}, \ + { 575,12}, { 1215,11}, { 2431,10}, { 4863,13}, \ + { 639,12}, { 1279,11}, { 2559,12}, { 1343,13}, \ + { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \ + { 1535,13}, { 831,12}, { 1727,13}, { 959,15}, \ + { 255,14}, { 511,13}, { 1023,12}, { 2047,13}, \ + { 1087,12}, { 2175,13}, { 1151,12}, { 2303,13}, \ + { 1215,12}, { 2431,11}, { 4863,14}, { 639,13}, \ + { 1279,12}, { 2559,13}, { 1343,12}, { 2687,13}, \ + { 1407,12}, { 2815,13}, { 1471,12}, { 2943,11}, \ + { 5887,14}, { 767,13}, { 1535,12}, { 3071,13}, \ + { 1727,14}, { 895,13}, { 1791,12}, { 3583,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2303,12}, { 4607,13}, { 2431,12}, \ + { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \ + { 2943,12}, { 5887,15}, { 767,14}, { 1535,13}, \ + { 3071,14}, { 1663,13}, { 3455,12}, { 6911,14}, \ + { 1791,13}, { 3583,14}, { 1919,13}, { 3839,16}, \ + { 511,15}, { 1023,14}, { 2431,13}, { 4863,15}, \ + 
{ 1279,14}, { 2943,13}, { 5887,12}, { 11775,15}, \ + { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \ + { 3839,13}, { 7679,12}, { 15359,14}, { 3967,16}, \ + { 1023,15}, { 2047,14}, { 4351,15}, { 2303,14}, \ + { 4863,15}, { 2815,14}, { 5887,13}, { 11775,16}, \ + { 1535,15}, { 3071,14}, { 6143,15}, { 3327,14}, \ + { 6911,15}, { 3839,14}, { 7679,13}, { 15359,17}, \ + { 1023,16}, { 2047,15}, { 4351,14}, { 8703,15}, \ + { 4863,16}, { 2559,15}, { 5887,14}, { 11775,16}, \ + { 3071,15}, { 6911,16}, { 3583,15}, { 7679,14}, \ + { 15359,15}, { 7935,17}, { 2047,16}, { 4095,15}, \ + { 8703,16}, { 4607,15}, { 9983,14}, { 19967,16}, \ + { 5119,15}, { 10239,16}, { 5631,15}, { 11775,17}, \ + { 3071,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 261 +#define MUL_FFT_THRESHOLD 4544 + +#define SQR_FFT_MODF_THRESHOLD 380 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 380, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \ + { 31, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \ + { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \ + { 383,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,10}, { 287, 9}, \ + { 575,10}, { 303, 9}, { 607,10}, { 319, 9}, \ + { 639,12}, { 95,11}, { 191,10}, { 383,11}, \ + { 207,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,10}, { 607,11}, { 319,10}, \ + { 639,11}, { 351,10}, { 703,11}, { 367,12}, \ + { 191,11}, { 415,12}, { 223,11}, { 479,13}, \ + { 127,12}, { 255,11}, { 
543,12}, { 287,11}, \ + { 607,12}, { 319,11}, { 639,12}, { 351,11}, \ + { 703,10}, { 1407,13}, { 191,12}, { 383,11}, \ + { 767,12}, { 479,14}, { 127,13}, { 255,12}, \ + { 607,13}, { 319,12}, { 703,11}, { 1407,12}, \ + { 735,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 799,13}, { 447,12}, { 895,11}, { 1791,14}, \ + { 255,13}, { 511,12}, { 1023,13}, { 575,12}, \ + { 1151,11}, { 2303,12}, { 1215,13}, { 639,12}, \ + { 1279,13}, { 703,12}, { 1407,14}, { 383,13}, \ + { 767,12}, { 1535,13}, { 831,12}, { 1663,13}, \ + { 895,12}, { 1791,13}, { 959,15}, { 255,14}, \ + { 511,13}, { 1023,12}, { 2047,13}, { 1087,12}, \ + { 2175,13}, { 1151,12}, { 2303,13}, { 1215,12}, \ + { 2431,14}, { 639,13}, { 1279,12}, { 2559,13}, \ + { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \ + { 1471,12}, { 2943,11}, { 5887,14}, { 767,13}, \ + { 1535,12}, { 3071,13}, { 1599,12}, { 3199,13}, \ + { 1663,14}, { 895,13}, { 1791,12}, { 3583,15}, \ + { 511,14}, { 1023,13}, { 2175,14}, { 1151,13}, \ + { 2303,12}, { 4607,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,13}, { 2943,12}, \ + { 5887,15}, { 767,14}, { 1535,13}, { 3199,14}, \ + { 1663,13}, { 3455,12}, { 6911,14}, { 1791,13}, \ + { 3583,14}, { 1919,16}, { 511,15}, { 1023,14}, \ + { 2303,13}, { 4607,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2943,13}, { 5887,12}, { 11775,15}, \ + { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \ + { 3583,13}, { 7167,14}, { 3839,13}, { 7679,12}, \ + { 15359,16}, { 1023,15}, { 2047,14}, { 4223,15}, \ + { 2303,14}, { 4863,15}, { 2559,14}, { 5119,15}, \ + { 2815,14}, { 5887,13}, { 11775,16}, { 1535,15}, \ + { 3071,14}, { 6143,15}, { 3327,14}, { 6911,15}, \ + { 3583,14}, { 7167,15}, { 3839,14}, { 7679,13}, \ + { 15359,17}, { 1023,16}, { 2047,15}, { 4095,14}, \ + { 8191,15}, { 4863,16}, { 2559,15}, { 5887,14}, \ + { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \ + { 7679,14}, { 15359,15}, { 7935,14}, { 15871,17}, \ + { 2047,16}, { 4095,15}, { 8447,16}, { 4607,15}, \ + { 9983,14}, { 19967,16}, { 
5119,15}, { 10239,16}, \ + { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 259 +#define SQR_FFT_THRESHOLD 3520 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 62 +#define MULLO_MUL_N_THRESHOLD 8907 +#define SQRLO_BASECASE_THRESHOLD 10 +#define SQRLO_DC_THRESHOLD 13 +#define SQRLO_SQR_THRESHOLD 7035 + +#define DC_DIV_QR_THRESHOLD 51 +#define DC_DIVAPPR_Q_THRESHOLD 183 +#define DC_BDIV_QR_THRESHOLD 47 +#define DC_BDIV_Q_THRESHOLD 88 + +#define INV_MULMOD_BNM1_THRESHOLD 46 +#define INV_NEWTON_THRESHOLD 226 +#define INV_APPR_THRESHOLD 204 + +#define BINV_NEWTON_THRESHOLD 264 +#define REDC_1_TO_REDC_2_THRESHOLD 28 +#define REDC_2_TO_REDC_N_THRESHOLD 54 + +#define MU_DIV_QR_THRESHOLD 1589 +#define MU_DIVAPPR_Q_THRESHOLD 1620 +#define MUPI_DIV_QR_THRESHOLD 83 +#define MU_BDIV_QR_THRESHOLD 1334 +#define MU_BDIV_Q_THRESHOLD 1470 + +#define POWM_SEC_TABLE 1,16,194,642 + +#define GET_STR_DC_THRESHOLD 10 +#define GET_STR_PRECOMPUTE_THRESHOLD 17 +#define SET_STR_DC_THRESHOLD 381 +#define SET_STR_PRECOMPUTE_THRESHOLD 1042 + +#define FAC_DSC_THRESHOLD 218 +#define FAC_ODD_THRESHOLD 25 + +#define MATRIX22_STRASSEN_THRESHOLD 21 +#define HGCD2_DIV1_METHOD 1 /* 6.58% faster than 3 */ +#define HGCD_THRESHOLD 136 +#define HGCD_APPR_THRESHOLD 168 +#define HGCD_REDUCE_THRESHOLD 3014 +#define GCD_DC_THRESHOLD 416 +#define GCDEXT_DC_THRESHOLD 393 +#define JACOBI_BASE_METHOD 4 /* 1.17% faster than 3 */ + +/* Tuneup completed successfully, took 800192 seconds */ diff --git a/gcc/gmp/mpn/x86_64/goldmont/mul_1.asm b/gcc/gmp/mpn/x86_64/goldmont/mul_1.asm new file mode 100644 index 0000000..ed1ec54 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/goldmont/mul_1.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_mul_1 optimised for Intel Goldmont. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_mul_1 mpn_mul_1c) +include_mpn(`x86_64/coreisbr/mul_1.asm') diff --git a/gcc/gmp/mpn/x86_64/goldmont/redc_1.asm b/gcc/gmp/mpn/x86_64/goldmont/redc_1.asm new file mode 100644 index 0000000..1192635 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/goldmont/redc_1.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_redc_1 optimised for Intel Goldmont. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_redc_1) +include_mpn(`x86_64/k8/redc_1.asm') diff --git a/gcc/gmp/mpn/x86_64/k10/gcd_11.asm b/gcc/gmp/mpn/x86_64/k10/gcd_11.asm new file mode 100644 index 0000000..4723093 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/k10/gcd_11.asm @@ -1,0 +1,37 @@ +dnl AMD64 mpn_gcd_11. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_11) +include_mpn(`x86_64/core2/gcd_11.asm') diff --git a/gcc/gmp/mpn/x86_64/k10/gcd_22.asm b/gcc/gmp/mpn/x86_64/k10/gcd_22.asm new file mode 100644 index 0000000..f58b4cc 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/k10/gcd_22.asm @@ -1,0 +1,142 @@ +dnl AMD64 mpn_gcd_22. Assumes useful bsf, useless shrd, no tzcnt, no shlx. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit +C AMD K8,K9 ? +C AMD K10 7.4 +C AMD bd1 9.9 +C AMD bd2 ? +C AMD bd3 ? +C AMD bd4 ? +C AMD bt1 ? +C AMD bt2 ? +C AMD zn1 ? +C AMD zn2 ? +C Intel P4 ? +C Intel CNR ? +C Intel PNR ? +C Intel NHM 9.2 +C Intel WSM 9.0 +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom ? +C Intel SLM ? +C Intel GLM ? +C Intel GLM+ ? +C VIA nano ? 
+ + +define(`u1', `%rdi') +define(`u0', `%rsi') +define(`v1', `%rdx') +define(`v0_param', `%rcx') + +define(`v0', `%rax') +define(`cnt', `%rcx') + +define(`s0', `%r8') +define(`s1', `%r9') +define(`t0', `%r10') +define(`t1', `%r11') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_22) + FUNC_ENTRY(4) + mov v0_param, v0 C v0 arrives in rcx; move it to rax so rcx can serve as cnt + + ALIGN(16) +L(top): mov v0, t0 C start t = v - u with the low limbs + sub u0, t0 + jz L(lowz) C jump when low limb result = 0 + mov v1, t1 + sbb u1, t1 C t1 = high limb of v - u + + mov u0, s0 + mov u1, s1 C s = copy of u taken before u is overwritten + + bsf t0, cnt C cnt = index of the lowest set bit in the low limb of v - u + + sub v0, u0 + sbb v1, u1 C u = u - v, carry set when u < v + +L(bck): cmovc t0, u0 C u = |u - v| + cmovnc u1, t1 C u = |u - v| + cmovc s0, v0 C v = min(u,v) + cmovc s1, v1 C v = min(u,v) + + shr R8(cnt), u0 + mov t1, u1 + shr R8(cnt), u1 + neg cnt C negated count drives the left shift that moves high limb bits down + shl R8(cnt), t1 + or t1, u0 C u is now |u - v| shifted right by cnt across both limbs + + test u1, u1 + jnz L(top) + test v1, v1 + jnz L(top) C keep looping while either high limb is nonzero + +L(gcd_11): + mov v0, %rdi C first argument for mpn_gcd_11; u0 already sits in rsi +C mov u0, %rsi + TCALL( mpn_gcd_11) + +L(lowz):C We come here when v0 - u0 = 0 + C 1. If v1 - u1 = 0, then gcd is u = v. + C 2. Else compute gcd_21({v1,v0}, |u1-v1|) + mov v1, t0 + sub u1, t0 + je L(end) + + xor t1, t1 C high limb of the new u is zero on this path + mov u0, s0 + mov u1, s1 + bsf t0, cnt C here t0 holds v1 - u1, so cnt comes from the high limb difference + mov u1, u0 + xor u1, u1 + sub v1, u0 + jmp L(bck) C rejoin the main path with u0 = u1 - v1 and carry set when u1 < v1 + +L(end): C mov v0, %rax + C mov v1, %rdx + FUNC_EXIT() + ret C v0 and v1 already live in rax and rdx, hence the commented-out moves above +EPILOGUE() diff --git a/gcc/gmp/mpn/x86_64/k8/addmul_2.asm b/gcc/gmp/mpn/x86_64/k8/addmul_2.asm new file mode 100644 index 0000000..78bcba1 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/k8/addmul_2.asm @@ -1,0 +1,195 @@ +dnl AMD64 mpn_addmul_2 -- Multiply an n-limb vector with a 2-limb vector and +dnl add the result to a third limb vector. + +dnl Copyright 2008, 2011, 2012, 2016 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cfg cycles/limb am1+am1 +C AMD K8,K9 2.375 +C AMD K10 2.375 +C AMD bull 5.2 <- 4.6-4.75 bad +C AMD pile 4.96 <- 4.6-4.75 bad +C AMD steam ? +C AMD excavator ? +C AMD bobcat 5.75 5.0 bad +C AMD jaguar 5.9 5.2-5.4 bad +C Intel P4 15-16 +C Intel core2 4.5 4.25-4.5 bad +C Intel NHM 4.33 4.55 bad +C Intel SBR 3.4 2.93 3.24 bad +C Intel IBR 3.35 2.6 2.95 bad +C Intel HWL 3.3 2.15 2.3 bad +C Intel BWL 2.33 2.33 1.65 bad +C Intel SKL 2.37 2.21 1.64 bad +C Intel atom 20 18.7 +C Intel SLM 8 8.5 +C VIA nano 4.4 + +C This code is the result of running a code generation and optimization tool +C suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Tune feed-in and wind-down code. 
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n_param',`%rdx') +define(`vp', `%rcx') + +define(`v0', `%r8') +define(`v1', `%r9') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r10') +define(`n', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_addmul_2) + FUNC_ENTRY(4) + mov n_param, n C the count moves to r11 because rdx receives the high half of every mul + push %rbx + push %rbp C rbx and rbp back w0 and w2; both are restored before ret + + mov 0(vp), v0 + mov 8(vp), v1 C vp shares rcx with w1, so both multiplier limbs are loaded before w1 is first written + + mov R32(n_param), R32(%rbx) C low bits of n pick the loop entry below + mov (up), %rax + lea -8(up,n_param,8), up + lea -8(rp,n_param,8), rp C up and rp now point at their last limb, index n - 1 + mul v0 C rdx:rax = up[0] * v0 + neg n C negative index that counts up toward zero + and $3, R32(%rbx) C n mod 4 + jz L(b0) + cmp $2, R32(%rbx) + jc L(b1) + jz L(b2) C remainders 0, 1, 2, 3 dispatch to b0, b1, b2, b3 + +L(b3): mov %rax, w1 + mov %rdx, w2 + xor R32(w3), R32(w3) + mov 8(up,n,8), %rax + dec n + jmp L(lo3) + +L(b2): mov %rax, w2 + mov 8(up,n,8), %rax + mov %rdx, w3 + xor R32(w0), R32(w0) + add $-2, n + jmp L(lo2) + +L(b1): mov %rax, w3 + mov 8(up,n,8), %rax + mov %rdx, w0 + xor R32(w1), R32(w1) + inc n + jmp L(lo1) + +L(b0): mov $0, R32(w3) + mov %rax, w0 + mov 8(up,n,8), %rax + mov %rdx, w1 + xor R32(w2), R32(w2) + jmp L(lo0) + + ALIGN(32) +L(top): mov $0, R32(w1) + mul v0 + add %rax, w3 + mov (up,n,8), %rax + adc %rdx, w0 + adc $0, R32(w1) +L(lo1): mul v1 + add w3, (rp,n,8) + mov $0, R32(w3) + adc %rax, w0 + mov $0, R32(w2) + mov 8(up,n,8), %rax + adc %rdx, w1 + mul v0 + add %rax, w0 + mov 8(up,n,8), %rax + adc %rdx, w1 + adc $0, R32(w2) +L(lo0): mul v1 + add w0, 8(rp,n,8) + adc %rax, w1 + adc %rdx, w2 + mov 16(up,n,8), %rax + mul v0 + add %rax, w1 + adc %rdx, w2 + adc $0, R32(w3) + mov 16(up,n,8), %rax +L(lo3): mul v1 + add w1, 16(rp,n,8) + adc %rax, w2 + adc %rdx, w3 + xor R32(w0), R32(w0) + mov 24(up,n,8), %rax + mul v0 + add %rax, w2 + mov 24(up,n,8), %rax + adc %rdx, w3 + adc $0, R32(w0) +L(lo2): mul v1 + add w2, 24(rp,n,8) + adc %rax, w3 + adc %rdx, w0 + mov 32(up,n,8), %rax + add $4, n C the index advances four limbs per pass through the loop + js L(top) + +L(end): xor R32(w1), R32(w1) C wind-down for the last limb of up + mul v0 + add %rax, w3 + mov (up), %rax + adc %rdx, w0 + adc R32(w1), R32(w1) + mul v1 +
 add w3, (rp) + adc %rax, w0 + adc %rdx, w1 + mov w0, 8(rp) C rp points at limb n - 1 here, so this writes limb n + mov w1, %rax C the most significant limb is the return value + + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gcc/gmp/mpn/x86_64/k8/bdiv_q_1.asm b/gcc/gmp/mpn/x86_64/k8/bdiv_q_1.asm new file mode 100644 index 0000000..1172b0d 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/k8/bdiv_q_1.asm @@ -1,0 +1,179 @@ +dnl AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor, +dnl returning quotient only. + +dnl Copyright 2001, 2002, 2004-2006, 2009, 2011, 2012, 2017 Free Software +dnl Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C norm/unorm +C AMD K8,K9 10 + +C AMD K10 10 + +C AMD bull 13.7 - +C AMD pile 13.7 + +C AMD steam +C AMD excavator +C AMD bobcat 15 - +C AMD jaguar 16 - +C Intel P4 33 = +C Intel core2 13.25 = +C Intel NHM 14 = +C Intel SBR 8.5 - +C Intel IBR 8.5 - +C Intel HWL 8 = +C Intel BWL 8 = +C Intel SKL 8 = +C Intel atom 42 -- +C Intel SLM 20.4 -- +C VIA nano + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`d', `%rcx') +define(`di', `%r8') C just mpn_pi1_bdiv_q_1 +define(`ncnt', `%r9') C just mpn_pi1_bdiv_q_1 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_bdiv_q_1) + FUNC_ENTRY(4) + push %rbx C rbx is scratch for d here and the carry bit in the shared loop + + mov %rcx, %rax C d + xor R32(%rcx), R32(%rcx) C ncnt count + mov %rdx, %r10 C n + + bt $0, R32(%rax) C copy the low bit of d into the carry flag + jnc L(evn) C skip bsf unless divisor is even + +L(odd): mov %rax, %rbx + shr R32(%rax) + and $127, R32(%rax) C d/2, 7 bits + + LEA( binvert_limb_table, %rdx) + + movzbl (%rdx,%rax), R32(%rax) C inv 8 bits + + mov %rbx, %r11 C d without twos + + lea (%rax,%rax), R32(%rdx) C 2*inv + imul R32(%rax), R32(%rax) C inv*inv + imul R32(%rbx), R32(%rax) C inv*inv*d + sub R32(%rax), R32(%rdx) C inv = 2*inv - inv*inv*d, 16 bits + + lea (%rdx,%rdx), R32(%rax) C 2*inv + imul R32(%rdx), R32(%rdx) C inv*inv + imul R32(%rbx), R32(%rdx) C inv*inv*d + sub R32(%rdx), R32(%rax) C inv = 2*inv - inv*inv*d, 32 bits + + lea (%rax,%rax), %r8 C 2*inv + imul %rax, %rax C inv*inv + imul %rbx, %rax C inv*inv*d + sub %rax, %r8 C inv = 2*inv - inv*inv*d, 64 bits + + jmp L(pi1) C enter the shared body in mpn_pi1_bdiv_q_1 with d in r11, inverse in r8, shift count in rcx, n in r10 + +L(evn): bsf %rax, %rcx C rcx = number of trailing zero bits of d + shr R8(%rcx), %rax C strip them so the inverse is computed for an odd value + jmp L(odd) +EPILOGUE() + +PROLOGUE(mpn_pi1_bdiv_q_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') +IFDOS(` mov 64(%rsp), %r9 ') + push %rbx + + mov %rcx, %r11 C d + mov %rdx, %r10 C n + mov %r9, %rcx C ncnt + +L(pi1): mov (up), %rax C up[0] + + dec %r10 + jz L(one) C single limb operand takes the short path + + mov 8(up), %rdx C up[1] + lea (up,%r10,8), up C up end + lea (rp,%r10,8), rp C rp end + neg %r10 C
-n + + shrd R8(%rcx), %rdx, %rax C up[0] shifted right by ncnt, refilled from up[1] + + xor R32(%rbx), R32(%rbx) C no carry bit yet + jmp L(ent) + + ALIGN(8) +L(top): + C rax q + C rbx carry bit, 0 or 1 + C rcx ncnt + C rdx + C r10 counter, limbs, negative + C r11 d + + mul %r11 C carry limb in rdx + mov (up,%r10,8), %rax + mov 8(up,%r10,8), %r9 + shrd R8(%rcx), %r9, %rax + nop + sub %rbx, %rax C apply carry bit + setc R8(%rbx) + sub %rdx, %rax C apply carry limb + adc $0, R32(%rbx) +L(ent): imul %r8, %rax C q = limb times the inverse in r8, low 64 bits + mov %rax, (rp,%r10,8) + inc %r10 + jnz L(top) + + mul %r11 C carry limb in rdx + mov (up), %rax C up high limb + shr R8(%rcx), %rax + sub %rbx, %rax C apply carry bit + sub %rdx, %rax C apply carry limb + imul %r8, %rax + mov %rax, (rp) + pop %rbx + FUNC_EXIT() + ret + +L(one): shr R8(%rcx), %rax C n = 1: shift, multiply by the inverse and store, with no carry to apply + imul %r8, %rax + mov %rax, (rp) + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gcc/gmp/mpn/x86_64/nano/gcd_11.asm b/gcc/gmp/mpn/x86_64/nano/gcd_11.asm new file mode 100644 index 0000000..4723093 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/nano/gcd_11.asm @@ -1,0 +1,37 @@ +dnl AMD64 mpn_gcd_11. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_11) +include_mpn(`x86_64/core2/gcd_11.asm') diff --git a/gcc/gmp/mpn/x86_64/pentium4/addmul_2.asm b/gcc/gmp/mpn/x86_64/pentium4/addmul_2.asm new file mode 100644 index 0000000..7ae6a1a 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/pentium4/addmul_2.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_addmul_2 optimised for Intel Nocona. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_2) +include_mpn(`x86_64/bd1/addmul_2.asm') diff --git a/gcc/gmp/mpn/x86_64/pentium4/aorsmul_1.asm b/gcc/gmp/mpn/x86_64/pentium4/aorsmul_1.asm new file mode 100644 index 0000000..e5dbb34 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/pentium4/aorsmul_1.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Nocona. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) +include_mpn(`x86_64/bd1/aorsmul_1.asm') diff --git a/gcc/gmp/mpn/x86_64/pentium4/mul_1.asm b/gcc/gmp/mpn/x86_64/pentium4/mul_1.asm new file mode 100644 index 0000000..70de670 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/pentium4/mul_1.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_mul_1 optimised for Intel Nocona. 
+ +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_mul_1 mpn_mul_1c) +include_mpn(`x86_64/bd1/mul_1.asm') diff --git a/gcc/gmp/mpn/x86_64/pentium4/mul_2.asm b/gcc/gmp/mpn/x86_64/pentium4/mul_2.asm new file mode 100644 index 0000000..a0f7302 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/pentium4/mul_2.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_mul_2 optimised for Intel Nocona. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_mul_2) +include_mpn(`x86_64/bd1/mul_2.asm') diff --git a/gcc/gmp/mpn/x86_64/pentium4/mul_basecase.asm b/gcc/gmp/mpn/x86_64/pentium4/mul_basecase.asm new file mode 100644 index 0000000..fb16029 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/pentium4/mul_basecase.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_mul_basecase optimised for Intel Nocona. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_mul_basecase) +include_mpn(`x86_64/core2/mul_basecase.asm') diff --git a/gcc/gmp/mpn/x86_64/pentium4/mullo_basecase.asm b/gcc/gmp/mpn/x86_64/pentium4/mullo_basecase.asm new file mode 100644 index 0000000..b9e08a8 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/pentium4/mullo_basecase.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_mullo_basecase optimised for Intel Nocona. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_mullo_basecase) +include_mpn(`x86_64/core2/mullo_basecase.asm') diff --git a/gcc/gmp/mpn/x86_64/pentium4/redc_1.asm b/gcc/gmp/mpn/x86_64/pentium4/redc_1.asm new file mode 100644 index 0000000..00e380d 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/pentium4/redc_1.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_redc_1 optimised for Intel Nocona. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_redc_1) +include_mpn(`x86_64/bt1/redc_1.asm') diff --git a/gcc/gmp/mpn/x86_64/pentium4/sqr_basecase.asm b/gcc/gmp/mpn/x86_64/pentium4/sqr_basecase.asm new file mode 100644 index 0000000..9725287 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/pentium4/sqr_basecase.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_sqr_basecase optimised for Intel Nocona. 
+ +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sqr_basecase) +include_mpn(`x86_64/core2/sqr_basecase.asm') diff --git a/gcc/gmp/mpn/x86_64/silvermont/aorrlsh1_n.asm b/gcc/gmp/mpn/x86_64/silvermont/aorrlsh1_n.asm new file mode 100644 index 0000000..98c26cf 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/silvermont/aorrlsh1_n.asm @@ -1,0 +1,50 @@ +dnl X86-64 mpn_addlsh1_n/mpn_rsblsh1_n optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 1) +define(RSH, 63) + +ifdef(`OPERATION_addlsh1_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func, mpn_addlsh1_n)') +ifdef(`OPERATION_rsblsh1_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func, mpn_rsblsh1_n)') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +include_mpn(`x86_64/aorrlshC_n.asm') diff --git a/gcc/gmp/mpn/x86_64/silvermont/aorrlsh2_n.asm b/gcc/gmp/mpn/x86_64/silvermont/aorrlsh2_n.asm new file mode 100644 index 0000000..2a83217 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/silvermont/aorrlsh2_n.asm @@ -1,0 +1,50 @@ +dnl X86-64 mpn_addlsh2_n/mpn_rsblsh2_n optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 2) +define(RSH, 62) + +ifdef(`OPERATION_addlsh2_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func, mpn_addlsh2_n)') +ifdef(`OPERATION_rsblsh2_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func, mpn_rsblsh2_n)') + +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +include_mpn(`x86_64/aorrlshC_n.asm') diff --git a/gcc/gmp/mpn/x86_64/silvermont/aors_n.asm b/gcc/gmp/mpn/x86_64/silvermont/aors_n.asm new file mode 100644 index 0000000..dce3d75 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/silvermont/aors_n.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_add_n, mpn_sub_n, optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) +include_mpn(`x86_64/coreisbr/aors_n.asm') diff --git a/gcc/gmp/mpn/x86_64/silvermont/aorsmul_1.asm b/gcc/gmp/mpn/x86_64/silvermont/aorsmul_1.asm new file mode 100644 index 0000000..ead0d76 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/silvermont/aorsmul_1.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_addmul_1/mpn_submul_1 optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) +include_mpn(`x86_64/core2/aorsmul_1.asm') diff --git a/gcc/gmp/mpn/x86_64/silvermont/gmp-mparam.h b/gcc/gmp/mpn/x86_64/silvermont/gmp-mparam.h new file mode 100644 index 0000000..f8cb0f4 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/silvermont/gmp-mparam.h @@ -1,0 +1,252 @@ +/* Intel Silvermont gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Disable use of slow functions. FIXME: We should disable lib inclusion. 
*/ +#undef HAVE_NATIVE_mpn_mul_2 +#undef HAVE_NATIVE_mpn_addmul_2 + +/* 2400 MHz Intel Atom C2758 Silvermont/Rangeley */ +/* FFT tuning limit = 468153400 */ +/* Generated by tuneup.c, 2019-10-19, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 55 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD MP_SIZE_T_MAX /* never */ + +#define DIV_1_VS_MUL_1_PERCENT 168 + +#define MUL_TOOM22_THRESHOLD 19 +#define MUL_TOOM33_THRESHOLD 66 +#define MUL_TOOM44_THRESHOLD 152 +#define MUL_TOOM6H_THRESHOLD 222 +#define MUL_TOOM8H_THRESHOLD 333 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 105 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 88 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 24 +#define SQR_TOOM3_THRESHOLD 97 +#define SQR_TOOM4_THRESHOLD 232 +#define SQR_TOOM6_THRESHOLD 286 +#define SQR_TOOM8_THRESHOLD 0 /* always */ + +#define MULMID_TOOM42_THRESHOLD 24 + +#define MULMOD_BNM1_THRESHOLD 13 +#define SQRMOD_BNM1_THRESHOLD 15 + +#define MUL_FFT_MODF_THRESHOLD 340 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 340, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 17, 7}, { 9, 6}, { 20, 7}, { 11, 6}, \ + { 23, 7}, { 17, 8}, { 9, 7}, { 21, 8}, \ + { 11, 7}, { 23, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 39, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 
15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47, 9}, { 95,10}, { 55,11}, \ + { 31,10}, { 79,11}, { 47,10}, { 95,12}, \ + { 31,11}, { 63,10}, { 127, 9}, { 255,10}, \ + { 135,11}, { 79, 9}, { 319,11}, { 95,10}, \ + { 191, 9}, { 383,10}, { 207, 9}, { 415,11}, \ + { 111,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271,11}, { 143,10}, { 287, 9}, \ + { 575,10}, { 303,11}, { 159,10}, { 319,12}, \ + { 95,11}, { 191,10}, { 383,11}, { 207,10}, \ + { 415,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 319,10}, { 639,11}, { 351,10}, \ + { 703, 9}, { 1407,12}, { 191,11}, { 415,10}, \ + { 831,12}, { 223,11}, { 479,13}, { 127,12}, \ + { 255,11}, { 543,12}, { 287,11}, { 575,10}, \ + { 1151,12}, { 319,11}, { 639,12}, { 351,11}, \ + { 703,10}, { 1407,13}, { 191,12}, { 415,11}, \ + { 831,10}, { 1663,12}, { 479,14}, { 127,13}, \ + { 255,12}, { 543,11}, { 1087,10}, { 2175,12}, \ + { 575,11}, { 1151,13}, { 319,12}, { 639,11}, \ + { 1279,12}, { 703,11}, { 1407,13}, { 383,12}, \ + { 831,11}, { 1663,13}, { 447,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1087,11}, { 2175,13}, \ + { 575,12}, { 1215,11}, { 2431,10}, { 4863,13}, \ + { 639,12}, { 1279,13}, { 703,12}, { 1407,14}, \ + { 383,13}, { 831,12}, { 1663,13}, { 959,15}, \ + { 255,14}, { 511,13}, { 1087,12}, { 2175,13}, \ + { 1215,12}, { 2431,11}, { 4863,14}, { 639,13}, \ + { 1407,12}, { 2815,13}, { 1471,12}, { 2943,11}, \ + { 5887,14}, { 767,13}, { 1663,14}, { 895,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \ + { 2559,14}, { 1407,13}, { 2943,12}, { 5887,15}, \ + { 767,14}, { 1663,13}, { 3455,12}, { 6911,14}, \ + { 1919,16}, { 511,15}, { 1023,14}, { 2431,13}, \ + { 4863,15}, { 1279,14}, { 2943,13}, { 5887,12}, \ + { 11775,15}, { 1535,14}, { 3455,13}, { 6911,15}, \ + { 1791,14}, { 3839,13}, { 7679,16}, { 1023,15}, \ + { 2047,14}, { 4223,15}, { 2303,14}, { 
4863,15}, \ + { 2815,14}, { 5887,13}, { 11775,16}, { 1535,15}, \ + { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \ + { 1023,16}, { 2047,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \ + { 3583,15}, { 7679,14}, { 15359,17}, { 2047,16}, \ + { 4607,15}, { 9215,16}, { 5631,15}, { 11775,17}, \ + { 3071,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 225 +#define MUL_FFT_THRESHOLD 3712 + +#define SQR_FFT_MODF_THRESHOLD 308 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 308, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 17, 8}, \ + { 9, 7}, { 21, 8}, { 11, 7}, { 23, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 33, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 47,11}, { 15,10}, { 31, 9}, \ + { 63,10}, { 39, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 79,11}, { 47,10}, \ + { 95,12}, { 31,11}, { 63,10}, { 127, 9}, \ + { 255, 8}, { 511, 9}, { 271, 8}, { 543,11}, \ + { 79,10}, { 159, 9}, { 319, 8}, { 639,10}, \ + { 175,11}, { 95,10}, { 191, 9}, { 383,10}, \ + { 207, 9}, { 415,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,10}, \ + { 287, 9}, { 575,10}, { 303,11}, { 159,10}, \ + { 319, 9}, { 639,11}, { 175,10}, { 351,12}, \ + { 95,11}, { 191,10}, { 383,11}, { 207,10}, \ + { 415,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 351,10}, { 703,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,10}, { 831,12}, \ + { 223,11}, { 479,12}, { 255,11}, { 543,12}, \ + { 287,11}, { 575,12}, { 319,11}, { 639,12}, \ + { 351,11}, { 703,10}, { 1407,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ + { 479,13}, { 255,12}, { 543,11}, { 1087,10}, \ + { 2175,12}, { 575,11}, { 1151,12}, { 607,13}, \ 
+ { 319,12}, { 639,11}, { 1279,12}, { 703,11}, \ + { 1407,13}, { 383,12}, { 831,11}, { 1663,13}, \ + { 447,12}, { 895,14}, { 255,13}, { 511,12}, \ + { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1279,13}, { 703,12}, \ + { 1407,14}, { 383,13}, { 831,12}, { 1663,13}, \ + { 959,15}, { 255,14}, { 511,13}, { 1087,12}, \ + { 2175,13}, { 1215,12}, { 2431,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1407,12}, { 2815,13}, \ + { 1471,12}, { 2943,14}, { 767,13}, { 1663,14}, \ + { 895,13}, { 1791,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,13}, { 2943,15}, \ + { 767,14}, { 1663,13}, { 3455,12}, { 6911,14}, \ + { 1791,13}, { 3583,16}, { 511,15}, { 1023,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2943,13}, \ + { 5887,12}, { 11775,15}, { 1535,14}, { 3455,13}, \ + { 6911,15}, { 1791,14}, { 3839,13}, { 7679,16}, \ + { 1023,15}, { 2047,14}, { 4223,15}, { 2303,14}, \ + { 4863,15}, { 2815,14}, { 5887,13}, { 11775,16}, \ + { 1535,15}, { 3071,14}, { 6143,15}, { 3327,14}, \ + { 6911,15}, { 3839,14}, { 7679,17}, { 1023,16}, \ + { 2047,15}, { 4863,16}, { 2559,15}, { 5887,14}, \ + { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \ + { 7679,14}, { 15359,17}, { 2047,16}, { 4607,15}, \ + { 9983,16}, { 5631,15}, { 11775,17}, { 3071,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 232 +#define SQR_FFT_THRESHOLD 2752 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 55 +#define MULLO_MUL_N_THRESHOLD 6633 +#define SQRLO_BASECASE_THRESHOLD 9 +#define SQRLO_DC_THRESHOLD 0 /* never mpn_sqrlo_basecase */ +#define SQRLO_SQR_THRESHOLD 5397 + +#define DC_DIV_QR_THRESHOLD 33 +#define DC_DIVAPPR_Q_THRESHOLD 222 +#define DC_BDIV_QR_THRESHOLD 31 +#define DC_BDIV_Q_THRESHOLD 147 + +#define INV_MULMOD_BNM1_THRESHOLD 37 +#define INV_NEWTON_THRESHOLD 222 +#define 
INV_APPR_THRESHOLD 222 + +#define BINV_NEWTON_THRESHOLD 212 +#define REDC_1_TO_REDC_2_THRESHOLD 55 +#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */ + +#define MU_DIV_QR_THRESHOLD 1142 +#define MU_DIVAPPR_Q_THRESHOLD 1142 +#define MUPI_DIV_QR_THRESHOLD 81 +#define MU_BDIV_QR_THRESHOLD 942 +#define MU_BDIV_Q_THRESHOLD 1043 + +#define POWM_SEC_TABLE 1,34,102,588,1730 + +#define GET_STR_DC_THRESHOLD 17 +#define GET_STR_PRECOMPUTE_THRESHOLD 30 +#define SET_STR_DC_THRESHOLD 381 +#define SET_STR_PRECOMPUTE_THRESHOLD 1659 + +#define FAC_DSC_THRESHOLD 351 +#define FAC_ODD_THRESHOLD 27 + +#define MATRIX22_STRASSEN_THRESHOLD 16 +#define HGCD2_DIV1_METHOD 3 /* 3.06% faster than 1 */ +#define HGCD_THRESHOLD 120 +#define HGCD_APPR_THRESHOLD 153 +#define HGCD_REDUCE_THRESHOLD 2121 +#define GCD_DC_THRESHOLD 416 +#define GCDEXT_DC_THRESHOLD 309 +#define JACOBI_BASE_METHOD 1 /* 2.28% faster than 3 */ + +/* Tuneup completed successfully, took 938046 seconds */ diff --git a/gcc/gmp/mpn/x86_64/silvermont/hamdist.asm b/gcc/gmp/mpn/x86_64/silvermont/hamdist.asm new file mode 100644 index 0000000..848ed01 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/silvermont/hamdist.asm @@ -1,0 +1,38 @@ +dnl x86-64 mpn_hamdist. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_hamdist) +include_mpn(`x86_64/coreinhm/hamdist.asm') diff --git a/gcc/gmp/mpn/x86_64/silvermont/lshift.asm b/gcc/gmp/mpn/x86_64/silvermont/lshift.asm new file mode 100644 index 0000000..acd3180 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/silvermont/lshift.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_lshift optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_lshift) +include_mpn(`x86_64/fastsse/lshift-movdqu2.asm') diff --git a/gcc/gmp/mpn/x86_64/silvermont/lshiftc.asm b/gcc/gmp/mpn/x86_64/silvermont/lshiftc.asm new file mode 100644 index 0000000..3a68bb5 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/silvermont/lshiftc.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_lshiftc optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_lshiftc) +include_mpn(`x86_64/fastsse/lshiftc-movdqu2.asm') diff --git a/gcc/gmp/mpn/x86_64/silvermont/mul_1.asm b/gcc/gmp/mpn/x86_64/silvermont/mul_1.asm new file mode 100644 index 0000000..c1e1c94 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/silvermont/mul_1.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_mul_1 optimised for Intel Silvermont. 
+ +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_mul_1 mpn_mul_1c) +include_mpn(`x86_64/bd1/mul_1.asm') diff --git a/gcc/gmp/mpn/x86_64/silvermont/mul_basecase.asm b/gcc/gmp/mpn/x86_64/silvermont/mul_basecase.asm new file mode 100644 index 0000000..6228c48 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/silvermont/mul_basecase.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_mul_basecase optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_mul_basecase) +include_mpn(`x86_64/k8/mul_basecase.asm') diff --git a/gcc/gmp/mpn/x86_64/silvermont/mullo_basecase.asm b/gcc/gmp/mpn/x86_64/silvermont/mullo_basecase.asm new file mode 100644 index 0000000..0244f8a 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/silvermont/mullo_basecase.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_mullo_basecase optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_mullo_basecase) +include_mpn(`x86_64/k8/mullo_basecase.asm') diff --git a/gcc/gmp/mpn/x86_64/silvermont/popcount.asm b/gcc/gmp/mpn/x86_64/silvermont/popcount.asm new file mode 100644 index 0000000..73eb7b5 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/silvermont/popcount.asm @@ -1,0 +1,38 @@ +dnl x86-64 mpn_popcount. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_popcount) +include_mpn(`x86_64/coreinhm/popcount.asm') diff --git a/gcc/gmp/mpn/x86_64/silvermont/rshift.asm b/gcc/gmp/mpn/x86_64/silvermont/rshift.asm new file mode 100644 index 0000000..b84371c 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/silvermont/rshift.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_rshift optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_rshift) +include_mpn(`x86_64/fastsse/rshift-movdqu2.asm') diff --git a/gcc/gmp/mpn/x86_64/silvermont/sqr_basecase.asm b/gcc/gmp/mpn/x86_64/silvermont/sqr_basecase.asm new file mode 100644 index 0000000..afccf93 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/silvermont/sqr_basecase.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_sqr_basecase optimised for Intel Silvermont. 
+ +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sqr_basecase) +include_mpn(`x86_64/k8/sqr_basecase.asm') diff --git a/gcc/gmp/mpn/x86_64/zen/aorrlsh1_n.asm b/gcc/gmp/mpn/x86_64/zen/aorrlsh1_n.asm new file mode 100644 index 0000000..803fa30 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/zen/aorrlsh1_n.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_addlsh1_n, mpn_addlsh1_nc, mpn_rsblsh1_n, mpn_rsblsh1_nc. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc) +include_mpn(`x86_64/atom/aorrlsh1_n.asm') diff --git a/gcc/gmp/mpn/x86_64/zen/aorrlsh_n.asm b/gcc/gmp/mpn/x86_64/zen/aorrlsh_n.asm new file mode 100644 index 0000000..e049b2f 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/zen/aorrlsh_n.asm @@ -1,0 +1,226 @@ +dnl AMD64 mpn_addlsh_n, mpn_rsblsh_n. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bd1 n/a +C AMD bd2 n/a +C AMD bd3 n/a +C AMD bd4 2.31 +C AMD zen 1.69 +C AMD bt1 n/a +C AMD bt2 n/a +C Intel P4 n/a +C Intel PNR n/a +C Intel NHM n/a +C Intel SBR n/a +C Intel IBR n/a +C Intel HWL 2.08 +C Intel BWL 1.78 +C Intel SKL 1.78 +C Intel atom n/a +C Intel SLM n/a +C VIA nano n/a + +C TODO +C * The loop sustains 4 insns/cycle on zen. +C * Perhaps avoid using jrcxz by using dec n + jnz. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cnt', `%r8') + +define(`tnc', `%r9') + +ifdef(`OPERATION_addlsh_n',` + define(ADCSBB, `adc') + define(func, mpn_addlsh_n) +') +ifdef(`OPERATION_rsblsh_n',` + define(ADCSBB, `sbb') + define(func, mpn_rsblsh_n) +') + +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(func) C limb by limb: rp = (vp shifted left by cnt) plus up for mpn_addlsh_n, minus up for mpn_rsblsh_n + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') C DOS64 only: cnt is fetched from the stack + + mov (vp), %r10 C preload vp[0] + + mov R32(n), R32(%rax) C keep a copy of n for the n mod 8 dispatch + shr $3, n C n = n div 8: loop counter, decremented once per 8-limb pass at L(e0) + xor R32(tnc), R32(tnc) + sub cnt, tnc C tnc = -cnt; NOTE(review): as a 64-bit shift count this presumably acts as 64-cnt + and $7, R32(%rax) C rax = n mod 8 + + lea L(tab)(%rip), %r11 C dispatch through L(tab) to the entry label matching n mod 8 +ifdef(`PIC',` + movslq (%r11,%rax,4), %rax + add %r11, %rax + jmp *%rax +',` + jmp *(%r11,%rax,8) +') + +L(0): lea 32(up), up + lea 32(vp), vp + lea 32(rp), rp + xor R32(%r11), R32(%r11) C no bits carried in from a previous limb; xor also clears CF + jmp L(e0) + +L(7): mov %r10, %r11 C this entry expects the first limb in r11 + lea 24(up), up + lea 24(vp), vp + lea 24(rp), rp + xor R32(%r10), R32(%r10) C no bits carried in from a previous limb; xor also clears CF + jmp L(e7) + +L(6): lea 16(up), up + lea 16(vp), vp + lea 16(rp), rp + xor R32(%r11), R32(%r11) + jmp L(e6) + +L(5): mov %r10, %r11 + lea 8(up), up + lea 8(vp), vp + lea 8(rp), rp + xor R32(%r10), R32(%r10) + jmp L(e5) + +L(end): ADCSBB 24(up), %rax C finish the last pending limb + mov %rax, -40(rp) + shrx( tnc, %r11, %rax) C bits shifted out of the top limb + ADCSBB n, %rax C n is 0 here (only reached via jrcxz), so this folds just CF into the value left in rax +
FUNC_EXIT() + ret + + ALIGN(32) +L(top): jrcxz L(end) C leave the loop once n has reached 0 + mov -32(vp), %r10 + ADCSBB 24(up), %rax + lea 64(up), up + shrx( tnc, %r11, %r11) + mov %rax, -40(rp) +L(e0): dec n C one more 8-limb pass consumed + shlx( cnt, %r10, %rax) C rax = current vp limb shifted left by cnt + lea (%r11,%rax), %rax C merge the bits kept from the previous limb; lea leaves CF alone + mov -24(vp), %r11 C fetch the next vp limb + ADCSBB -32(up), %rax C combine with the up limb, propagating carry or borrow + shrx( tnc, %r10, %r10) C bits shifted out of this limb, kept for the next one + mov %rax, -32(rp) C store the result limb +L(e7): shlx( cnt, %r11, %rax) C same step as above with r10 and r11 swapping roles + lea (%r10,%rax), %rax + mov -16(vp), %r10 + ADCSBB -24(up), %rax + shrx( tnc, %r11, %r11) + mov %rax, -24(rp) +L(e6): shlx( cnt, %r10, %rax) + lea (%r11,%rax), %rax + mov -8(vp), %r11 + ADCSBB -16(up), %rax + shrx( tnc, %r10, %r10) + mov %rax, -16(rp) +L(e5): shlx( cnt, %r11, %rax) + lea (%r10,%rax), %rax + mov (vp), %r10 + ADCSBB -8(up), %rax + shrx( tnc, %r11, %r11) + mov %rax, -8(rp) +L(e4): shlx( cnt, %r10, %rax) + lea (%r11,%rax), %rax + mov 8(vp), %r11 + ADCSBB (up), %rax + shrx( tnc, %r10, %r10) + mov %rax, (rp) +L(e3): shlx( cnt, %r11, %rax) + lea (%r10,%rax), %rax + mov 16(vp), %r10 + ADCSBB 8(up), %rax + shrx( tnc, %r11, %r11) + mov %rax, 8(rp) +L(e2): shlx( cnt, %r10, %rax) + lea (%r11,%rax), %rax + mov 24(vp), %r11 + ADCSBB 16(up), %rax + lea 64(vp), vp + shrx( tnc, %r10, %r10) + mov %rax, 16(rp) + lea 64(rp), rp +L(e1): shlx( cnt, %r11, %rax) + lea (%r10,%rax), %rax + jmp L(top) C the add or subtract and the store for this limb happen at L(top) or L(end) + +L(4): xor R32(%r11), R32(%r11) C no pointer adjustment needed for this entry + jmp L(e4) + +L(3): mov %r10, %r11 + lea -8(up), up + lea -8(vp), vp + lea -8(rp), rp + xor R32(%r10), R32(%r10) + jmp L(e3) + +L(2): lea -16(up), up + lea -16(vp), vp + lea -16(rp), rp + xor R32(%r11), R32(%r11) + jmp L(e2) + +L(1): mov %r10, %r11 + lea -24(up), up + lea 40(vp), vp + lea 40(rp), rp + xor R32(%r10), R32(%r10) + jmp L(e1) +EPILOGUE() + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(0), L(tab)) C entry k is the start label used when n mod 8 = k + JMPENT( L(1), L(tab)) + JMPENT( L(2), L(tab)) + JMPENT( L(3), L(tab)) + JMPENT( L(4), L(tab)) + JMPENT( L(5), L(tab)) + JMPENT( L(6), L(tab)) + JMPENT( L(7), L(tab)) diff --git a/gcc/gmp/mpn/x86_64/zen/aorsmul_1.asm b/gcc/gmp/mpn/x86_64/zen/aorsmul_1.asm new file mode 100644 index 0000000..89795e3 100644
--- /dev/null +++ b/gcc/gmp/mpn/x86_64/zen/aorsmul_1.asm @@ -1,0 +1,165 @@ +dnl AMD64 mpn_addmul_1 and mpn_submul_1 for CPUs with mulx. + +dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 - +C AMD K10 - +C AMD bd1 - +C AMD bd2 - +C AMD bd3 - +C AMD bd4 4.3 +C AMD zen 2 +C AMD bt1 - +C AMD bt2 - +C Intel P4 - +C Intel PNR - +C Intel NHM - +C Intel SBR - +C Intel IBR - +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? 
+C Intel atom - +C Intel SLM - +C VIA nano - + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0_param',`%rcx') C r9 + +define(`n', `%rcx') C n and v0 name the registers as they are after the xchg below swaps rcx and rdx +define(`v0', `%rdx') + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`ADCSBB', `adc') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`ADCSBB', `sbb') + define(`func', `mpn_submul_1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) C rp[] is updated in place by ADDSUB/ADCSBB with the mulx products of up[] and v0; rax ends up holding the carry limb + FUNC_ENTRY(4) + mov (up), %r8 C preload up[0] before up is advanced + + push %rbx C rbx, r12 and r13 are used below and restored before ret + push %r12 + push %r13 + + lea (up,n_param,8), up C up now points just past the last source limb + lea -32(rp,n_param,8), rp C rp now points 4 limbs before the end of the destination + mov R32(n_param), R32(%rax) C keep a copy of the limb count for the n mod 4 dispatch + xchg v0_param, v0 C FIXME: is this insn fast? + + neg n C n becomes a negative index that counts up towards 0 + + and $3, R8(%rax) C choose the entry sequence from n mod 4 + jz L(b0) + cmp $2, R8(%rax) + jz L(b2) + jg L(b3) + +L(b1): mulx( %r8, %rbx, %rax) + sub $-1, n C n += 1 + jz L(wd1) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + test R32(%rax), R32(%rax) C clear cy + jmp L(lo1) + +L(b0): mulx( %r8, %r9, %r8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + xor R32(%rax), R32(%rax) C rax = 0 and CF = 0 before joining the loop + jmp L(lo0) + +L(b3): mulx( %r8, %r11, %r10) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x08 C mulx 8(up,n,8), %r13, %r12 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x10 C mulx 16(up,n,8), %rbx, %rax + add %r10, %r13 + adc %r12, %rbx + adc $0, %rax + sub $-3, n C n += 3 + jz L(wd3) + test R32(%rax), R32(%rax) C clear cy + jmp L(lo3) + +L(b2): mulx( %r8, %r13, %r12) + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x08 C mulx 8(up,n,8), %rbx, %rax + add %r12, %rbx + adc $0, %rax + sub $-2, n C n += 2 + jz L(wd2) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + test R32(%rax), R32(%rax) C clear cy + jmp L(lo2) + +L(top): ADDSUB %r9, (rp,n,8) +L(lo3): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C
mulx (up,n,8), %r9, %r8 + ADCSBB %r11, 8(rp,n,8) +L(lo2): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + ADCSBB %r13, 16(rp,n,8) +L(lo1): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + ADCSBB %rbx, 24(rp,n,8) + adc %rax, %r9 C add the previous carry limb into the next low product limb +L(lo0): .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax C rax = carry limb + add $4, n C advance the negative index by the 4 limbs handled per pass + js L(top) C repeat while the index is still negative + +L(end): ADDSUB %r9, (rp) C tail: apply the product limbs still pending; wd3, wd2 and wd1 are the short-count entries +L(wd3): ADCSBB %r11, 8(rp) +L(wd2): ADCSBB %r13, 16(rp) +L(wd1): ADCSBB %rbx, 24(rp) + adc n, %rax C NOTE(review): n looks to be 0 on every path reaching here, so this only adds CF into the carry limb + pop %r13 + pop %r12 + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() +ASM_END() diff --git a/gcc/gmp/mpn/x86_64/zen/com.asm b/gcc/gmp/mpn/x86_64/zen/com.asm new file mode 100644 index 0000000..b34f841 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/zen/com.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_com optimised for AMD Zen. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/.
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_com) +include_mpn(`x86_64/fastsse/com.asm') diff --git a/gcc/gmp/mpn/x86_64/zen/copyd.asm b/gcc/gmp/mpn/x86_64/zen/copyd.asm new file mode 100644 index 0000000..63ed237 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/zen/copyd.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_copyd optimised for AMD Zen. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyd) +include_mpn(`x86_64/fastsse/copyd.asm') diff --git a/gcc/gmp/mpn/x86_64/zen/copyi.asm b/gcc/gmp/mpn/x86_64/zen/copyi.asm new file mode 100644 index 0000000..1aafaaa 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/zen/copyi.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_copyi optimised for AMD Zen. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyi) +include_mpn(`x86_64/fastsse/copyi.asm') diff --git a/gcc/gmp/mpn/x86_64/zen/gcd_11.asm b/gcc/gmp/mpn/x86_64/zen/gcd_11.asm new file mode 100644 index 0000000..0ffb6ca 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/zen/gcd_11.asm @@ -1,0 +1,37 @@ +dnl AMD64 mpn_gcd_11. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_11) +include_mpn(`x86_64/bd2/gcd_11.asm') diff --git a/gcc/gmp/mpn/x86_64/zen/gcd_22.asm b/gcc/gmp/mpn/x86_64/zen/gcd_22.asm new file mode 100644 index 0000000..5dfd9e3 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/zen/gcd_22.asm @@ -1,0 +1,37 @@ +dnl AMD64 mpn_gcd_22. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +dnl ABI_SUPPORT(DOS64) C returns mp_double_limb_t in memory +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_22) +include_mpn(`x86_64/coreihwl/gcd_22.asm') diff --git a/gcc/gmp/mpn/x86_64/zen/gmp-mparam.h b/gcc/gmp/mpn/x86_64/zen/gmp-mparam.h new file mode 100644 index 0000000..05a12b3 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/zen/gmp-mparam.h @@ -1,0 +1,280 @@ +/* AMD Zen gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Disable use of slow functions. FIXME: We should disable lib inclusion. 
*/ +#undef HAVE_NATIVE_mpn_mul_2 +#undef HAVE_NATIVE_mpn_addmul_2 + +/* 3700-4300 MHz Pinnacle Ridge */ +/* FFT tuning limit = 468,514,360 */ +/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 13 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 18 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 32 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 22 + +#define DIV_1_VS_MUL_1_PERCENT 338 + +#define MUL_TOOM22_THRESHOLD 16 +#define MUL_TOOM33_THRESHOLD 107 +#define MUL_TOOM44_THRESHOLD 190 +#define MUL_TOOM6H_THRESHOLD 230 +#define MUL_TOOM8H_THRESHOLD 272 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 110 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 106 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 117 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 136 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 32 +#define SQR_TOOM3_THRESHOLD 114 +#define SQR_TOOM4_THRESHOLD 422 +#define SQR_TOOM6_THRESHOLD 0 /* always */ +#define SQR_TOOM8_THRESHOLD 0 /* always */ + +#define MULMID_TOOM42_THRESHOLD 40 + +#define MULMOD_BNM1_THRESHOLD 12 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define MUL_FFT_MODF_THRESHOLD 540 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 540, 5}, { 22, 6}, { 12, 5}, { 25, 6}, \ + { 25, 7}, { 13, 6}, { 29, 7}, { 15, 6}, \ + { 31, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \ + { 13, 7}, { 29, 8}, { 15, 7}, { 32, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 21, 7}, { 43, 9}, { 11, 8}, { 29, 9}, \ + { 15, 8}, { 35, 9}, { 19, 8}, { 43, 9}, \ + { 23, 8}, { 49, 9}, { 27,10}, { 15, 9}, \ + { 31, 8}, { 63, 9}, { 
43,10}, { 23, 9}, \ + { 55,11}, { 15,10}, { 31, 9}, { 67,10}, \ + { 39, 9}, { 83,10}, { 47, 9}, { 99,10}, \ + { 55,11}, { 31,10}, { 79,11}, { 47,10}, \ + { 103,12}, { 31,11}, { 63,10}, { 135,11}, \ + { 79,10}, { 167,11}, { 95,10}, { 191,12}, \ + { 63,11}, { 159,12}, { 95,11}, { 191,13}, \ + { 63,12}, { 127,11}, { 255,10}, { 511,11}, \ + { 271,10}, { 543,11}, { 287,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 335,10}, { 671, 9}, \ + { 1343,11}, { 351,12}, { 191,11}, { 383,10}, \ + { 767,11}, { 415,10}, { 831,12}, { 223,11}, \ + { 447,13}, { 127,12}, { 255,11}, { 543,10}, \ + { 1087,12}, { 287,11}, { 575,10}, { 1151,11}, \ + { 607,10}, { 1215,12}, { 319,11}, { 639,10}, \ + { 1279,11}, { 671,10}, { 1343, 9}, { 2687,12}, \ + { 351,11}, { 703,13}, { 191,12}, { 383,11}, \ + { 767,12}, { 415,11}, { 831,10}, { 1663,12}, \ + { 447,14}, { 127,13}, { 255,12}, { 511,11}, \ + { 1023,12}, { 543,11}, { 1087,12}, { 575,11}, \ + { 1151,12}, { 607,11}, { 1215,13}, { 319,12}, \ + { 639,11}, { 1279,12}, { 671,11}, { 1343,10}, \ + { 2687,12}, { 703,11}, { 1407,13}, { 383,12}, \ + { 799,11}, { 1599,12}, { 831,11}, { 1663,13}, \ + { 447,12}, { 895,11}, { 1791,12}, { 927,11}, \ + { 1855,12}, { 959,11}, { 1919,10}, { 3839,13}, \ + { 511,12}, { 1087,11}, { 2175,13}, { 575,12}, \ + { 1215,11}, { 2431,13}, { 639,12}, { 1343,11}, \ + { 2687,13}, { 703,12}, { 1407,14}, { 383,13}, \ + { 767,12}, { 1599,13}, { 831,12}, { 1727,11}, \ + { 3455,13}, { 895,12}, { 1855,13}, { 959,12}, \ + { 1919,11}, { 3839,14}, { 511,13}, { 1087,12}, \ + { 2175,13}, { 1215,12}, { 2431,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1471,12}, { 2943,14}, \ + { 767,13}, { 1599,12}, { 3199,13}, { 1727,12}, \ + { 3455,14}, { 895,13}, { 1855,12}, { 3711,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \ + { 2687,14}, { 1407,13}, { 2815,15}, { 767,14}, \ + { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \ + { 6911,14}, { 1791,13}, { 3583,14}, { 1919,16}, \ + { 
511,15}, { 1023,14}, { 2175,13}, { 4479,14}, \ + { 2431,13}, { 4863,15}, { 1279,14}, { 2687,13}, \ + { 5375,14}, { 2943,13}, { 5887,15}, { 1535,14}, \ + { 3455,13}, { 6911,15}, { 1791,14}, { 3839,13}, \ + { 7679,16}, { 1023,15}, { 2047,14}, { 4479,15}, \ + { 2303,14}, { 4991,15}, { 2559,14}, { 5247,15}, \ + { 2815,14}, { 5887,16}, { 1535,15}, { 3327,14}, \ + { 6911,15}, { 3839,14}, { 7679,17}, { 1023,16}, \ + { 2047,15}, { 4095,14}, { 8191,15}, { 4351,14}, \ + { 8959,15}, { 4863,16}, { 2559,15}, { 5375,14}, \ + { 11007,15}, { 5887,14}, { 11775,16}, { 3071,15}, \ + { 6911,16}, { 3583,15}, { 7167,14}, { 14335,15}, \ + { 7679,14}, { 15359,15}, { 7935,14}, { 15871,17}, \ + { 2047,16}, { 4095,15}, { 8959,16}, { 4607,15}, \ + { 9215,14}, { 18431,15}, { 9727,14}, { 19455,15}, \ + { 9983,14}, { 19967,16}, { 5119,15}, { 11007,16}, \ + { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 271 +#define MUL_FFT_THRESHOLD 6272 + +#define SQR_FFT_MODF_THRESHOLD 404 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 404, 5}, { 13, 4}, { 27, 5}, { 21, 6}, \ + { 11, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 14, 5}, { 29, 6}, { 29, 7}, { 15, 6}, \ + { 31, 7}, { 17, 6}, { 35, 7}, { 25, 8}, \ + { 13, 7}, { 29, 8}, { 15, 7}, { 33, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 29, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 49, 9}, { 27,10}, \ + { 15, 9}, { 31, 8}, { 63, 9}, { 43,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 135,11}, { 79,10}, { 159,11}, { 95,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,11}, \ + { 143,10}, { 287, 9}, { 575,11}, { 159,12}, \ + { 95,11}, { 191,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 271,10}, { 543,11}, \ + { 287,10}, { 575,11}, { 303,12}, { 159,11}, \ + { 
319,10}, { 639,11}, { 335,10}, { 671, 9}, \ + { 1343,11}, { 351,10}, { 703,11}, { 367,10}, \ + { 735,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 399,10}, { 799,11}, { 415,10}, { 831,12}, \ + { 223,11}, { 447,10}, { 895,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \ + { 1087,12}, { 287,11}, { 575,10}, { 1151,11}, \ + { 607,10}, { 1215,12}, { 319,11}, { 639,10}, \ + { 1279,11}, { 671,10}, { 1343,12}, { 351,11}, \ + { 703,10}, { 1407,11}, { 735,10}, { 1471,13}, \ + { 191,12}, { 383,11}, { 767,10}, { 1535,11}, \ + { 799,12}, { 415,11}, { 831,10}, { 1663,12}, \ + { 447,11}, { 895,14}, { 127,13}, { 255,12}, \ + { 511,11}, { 1023,12}, { 543,11}, { 1087,12}, \ + { 575,11}, { 1151,12}, { 607,11}, { 1215,13}, \ + { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \ + { 1343,12}, { 703,11}, { 1407,12}, { 735,11}, \ + { 1471,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 799,11}, { 1599,12}, { 831,11}, { 1663,13}, \ + { 447,12}, { 895,11}, { 1791,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1023,11}, { 2047,12}, \ + { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1343,13}, { 703,12}, \ + { 1471,11}, { 2943,14}, { 383,13}, { 767,12}, \ + { 1599,13}, { 831,12}, { 1727,11}, { 3455,13}, \ + { 895,12}, { 1855,13}, { 959,15}, { 255,14}, \ + { 511,13}, { 1023,12}, { 2047,13}, { 1087,12}, \ + { 2175,13}, { 1215,12}, { 2431,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1471,12}, { 2943,14}, \ + { 767,13}, { 1599,12}, { 3199,13}, { 1727,12}, \ + { 3455,14}, { 895,13}, { 1855,12}, { 3711,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \ + { 2687,14}, { 1407,13}, { 2943,15}, { 767,14}, \ + { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \ + { 6911,14}, { 1791,13}, { 3583,14}, { 1919,16}, \ + { 511,15}, { 1023,14}, { 2047,13}, { 4095,14}, \ + { 2175,13}, { 4479,12}, { 8959,14}, { 2431,13}, \ + { 4863,15}, { 1279,14}, { 2943,13}, { 5887,12}, \ + { 11775,15}, { 1535,14}, { 3455,13}, { 6911,15}, 
\ + { 1791,14}, { 3839,13}, { 7679,14}, { 3967,16}, \ + { 1023,15}, { 2047,14}, { 4479,15}, { 2303,14}, \ + { 4991,15}, { 2559,14}, { 5247,15}, { 2815,14}, \ + { 5887,13}, { 11775,16}, { 1535,15}, { 3071,14}, \ + { 6143,15}, { 3327,14}, { 6911,15}, { 3839,14}, \ + { 7679,17}, { 1023,16}, { 2047,15}, { 4095,14}, \ + { 8191,15}, { 4351,14}, { 8959,15}, { 4863,14}, \ + { 9727,16}, { 2559,15}, { 5887,14}, { 11775,16}, \ + { 3071,15}, { 6911,16}, { 3583,15}, { 7167,14}, \ + { 14335,15}, { 7679,14}, { 15359,15}, { 7935,14}, \ + { 15871,17}, { 2047,16}, { 4095,15}, { 8959,16}, \ + { 4607,15}, { 9215,14}, { 18431,15}, { 9727,14}, \ + { 19455,15}, { 9983,14}, { 19967,16}, { 5119,15}, \ + { 10239,16}, { 5631,15}, { 11775,17}, { 3071,16}, \ + { 6655,15}, { 13311,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 302 +#define SQR_FFT_THRESHOLD 4224 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 69 +#define MULLO_MUL_N_THRESHOLD 11278 +#define SQRLO_BASECASE_THRESHOLD 12 +#define SQRLO_DC_THRESHOLD 82 +#define SQRLO_SQR_THRESHOLD 8207 + +#define DC_DIV_QR_THRESHOLD 76 +#define DC_DIVAPPR_Q_THRESHOLD 232 +#define DC_BDIV_QR_THRESHOLD 76 +#define DC_BDIV_Q_THRESHOLD 104 + +#define INV_MULMOD_BNM1_THRESHOLD 37 +#define INV_NEWTON_THRESHOLD 274 +#define INV_APPR_THRESHOLD 230 + +#define BINV_NEWTON_THRESHOLD 372 +#define REDC_1_TO_REDC_N_THRESHOLD 68 + +#define MU_DIV_QR_THRESHOLD 1499 +#define MU_DIVAPPR_Q_THRESHOLD 1718 +#define MUPI_DIV_QR_THRESHOLD 108 +#define MU_BDIV_QR_THRESHOLD 1470 +#define MU_BDIV_Q_THRESHOLD 1787 + +#define POWM_SEC_TABLE 3,22,81,494 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 20 +#define SET_STR_DC_THRESHOLD 486 +#define SET_STR_PRECOMPUTE_THRESHOLD 1264 + +#define FAC_DSC_THRESHOLD 187 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 23 +#define 
HGCD2_DIV1_METHOD 1 /* 9.20% faster than 3 */ +#define HGCD_THRESHOLD 109 +#define HGCD_APPR_THRESHOLD 104 +#define HGCD_REDUCE_THRESHOLD 3014 +#define GCD_DC_THRESHOLD 566 +#define GCDEXT_DC_THRESHOLD 382 +#define JACOBI_BASE_METHOD 1 /* 15.55% faster than 3 */ + +/* Tuneup completed successfully, took 281243 seconds */ diff --git a/gcc/gmp/mpn/x86_64/zen/hamdist.asm b/gcc/gmp/mpn/x86_64/zen/hamdist.asm new file mode 100644 index 0000000..48dcf61 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/zen/hamdist.asm @@ -1,0 +1,38 @@ +dnl AMD64 mpn_hamdist -- hamming distance. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_hamdist) +include_mpn(`x86_64/coreinhm/hamdist.asm') diff --git a/gcc/gmp/mpn/x86_64/zen/lshift.asm b/gcc/gmp/mpn/x86_64/zen/lshift.asm new file mode 100644 index 0000000..4dce319 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/zen/lshift.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_lshift optimised for AMD Zen. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_lshift) +include_mpn(`x86_64/fastsse/lshift-movdqu2.asm') diff --git a/gcc/gmp/mpn/x86_64/zen/lshiftc.asm b/gcc/gmp/mpn/x86_64/zen/lshiftc.asm new file mode 100644 index 0000000..d52b194 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/zen/lshiftc.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_lshiftc optimised for AMD Zen. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_lshiftc) +include_mpn(`x86_64/fastsse/lshiftc-movdqu2.asm') diff --git a/gcc/gmp/mpn/x86_64/zen/mul_1.asm b/gcc/gmp/mpn/x86_64/zen/mul_1.asm new file mode 100644 index 0000000..6a083ac 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/zen/mul_1.asm @@ -1,0 +1,161 @@ +dnl AMD64 mpn_mul_1 for CPUs with mulx. + +dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. 
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 - +C AMD K10 - +C AMD bd1 - +C AMD bd2 - +C AMD bd3 - +C AMD bd4 4.4 +C AMD zen 2 +C AMD bobcat - +C AMD jaguar - +C Intel P4 - +C Intel PNR - +C Intel NHM - +C Intel SBR - +C Intel IBR - +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom - +C Intel SLM - +C VIA nano - + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0_param',`%rcx') C r9 + +define(`n', `%rcx') +define(`v0', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_1c) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + jmp L(ent) +EPILOGUE() + ALIGN(16) +PROLOGUE(mpn_mul_1) + FUNC_ENTRY(4) + xor R32(%r8), R32(%r8) C carry-in limb +L(ent): mov (up), %r9 + + push %rbx + push %r12 + push %r13 + + lea (up,n_param,8), up + lea -32(rp,n_param,8), rp + mov R32(n_param), R32(%rax) + xchg v0_param, v0 C FIXME: is this insn fast? 
+ + neg n + + and $3, R8(%rax) + jz L(b0) + cmp $2, R8(%rax) + jz L(b2) + jg L(b3) + +L(b1): mov %r8, %r12 + mulx( %r9, %rbx, %rax) + sub $-1, n + jz L(wd1) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + add %r12, %rbx + jmp L(lo1) + +L(b3): mulx( %r9, %r11, %r10) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x08 C mulx 8(up,n,8), %r13, %r12 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x10 C mulx 16(up,n,8), %rbx, %rax + sub $-3, n + jz L(wd3) + add %r8, %r11 + jmp L(lo3) + +L(b2): mov %r8, %r10 C carry-in limb + mulx( %r9, %r13, %r12) + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x08 C mulx 8(up,n,8), %rbx, %rax + sub $-2, n + jz L(wd2) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + add %r10, %r13 + jmp L(lo2) + +L(b0): mov %r8, %rax C carry-in limb + mulx( %r9, %r9, %r8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + add %rax, %r9 + jmp L(lo0) + +L(top): jrcxz L(end) + adc %r8, %r11 + mov %r9, (rp,n,8) +L(lo3): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r10, %r13 + mov %r11, 8(rp,n,8) +L(lo2): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r12, %rbx + mov %r13, 16(rp,n,8) +L(lo1): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rax, %r9 + mov %rbx, 24(rp,n,8) +L(lo0): .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + lea 4(n), n + jmp L(top) + +L(end): mov %r9, (rp) +L(wd3): adc %r8, %r11 + mov %r11, 8(rp) +L(wd2): adc %r10, %r13 + mov %r13, 16(rp) +L(wd1): adc %r12, %rbx + adc $0, %rax + mov %rbx, 24(rp) + + pop %r13 + pop %r12 + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() +ASM_END() diff --git a/gcc/gmp/mpn/x86_64/zen/mul_basecase.asm b/gcc/gmp/mpn/x86_64/zen/mul_basecase.asm new file mode 100644 index 0000000..affa3b6 100644 --- /dev/null +++ 
b/gcc/gmp/mpn/x86_64/zen/mul_basecase.asm @@ -1,0 +1,455 @@ +dnl AMD64 mpn_mul_basecase optimised for AMD Zen. + +dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO +C * Try 2x unrolling instead of current 4x, at least for mul_1. Else consider +C shallower sw pipelining of mul_1/addmul_1 loops, allowing 4 or 6 instead +C of 8 product registers. +C * Split up mul_1 into 4 loops in order to fall into the addmul_1 loops +C without branch tree. +C * Improve the overlapped software pipelining. The mulx in the osp block now +C suffers from write/read conflicts, in particular the 1 mod 4 case. Also, +C mul_1 could osp into addmul_1. +C * Let vn_param be vn to save a copy. +C * Re-allocate to benefit more from 32-bit encoding. +C * Poor performance for e.g. n = 12,16. 
+ +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param', `%rdx') +define(`vp_param', `%rcx') +define(`vn_param', `%r8') + +define(`un', `%r14') +define(`vp', `%rbp') +define(`v0', `%rdx') +define(`n', `%rcx') +define(`vn', `%r15') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mul_basecase) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + + cmp $2, un_param + ja L(gen) + mov (vp_param), %rdx + mulx( (up), %rax, %r9) C 0 1 + je L(s2x) + +L(s11): mov %rax, (rp) + mov %r9, 8(rp) + FUNC_EXIT() + ret + +L(s2x): cmp $2, vn_param + mulx( 8,(up), %r8, %r10) C 1 2 + je L(s22) + +L(s21): add %r8, %r9 + adc $0, %r10 + mov %rax, (rp) + mov %r9, 8(rp) + mov %r10, 16(rp) + FUNC_EXIT() + ret + +L(s22): add %r8, %r9 C 1 + adc $0, %r10 C 2 + mov 8(vp_param), %rdx + mov %rax, (rp) + mulx( (up), %r8, %r11) C 1 2 + mulx( 8,(up), %rax, %rdx) C 2 3 + add %r11, %rax C 2 + adc $0, %rdx C 3 + add %r8, %r9 C 1 + adc %rax, %r10 C 2 + adc $0, %rdx C 3 + mov %r9, 8(rp) + mov %r10, 16(rp) + mov %rdx, 24(rp) + FUNC_EXIT() + ret + + +L(gen): push %r15 + push %r14 + push %r13 + push %r12 + push %rbp + push %rbx + + mov un_param, un + mov vp_param, vp + mov vn_param, vn + + mov (up), %r9 + mov (vp), v0 + + lea (up,un,8), up + lea -32(rp,un,8), rp + + neg un + mov un, n + test $1, R8(un) + jz L(mx0) +L(mx1): test $2, R8(un) + jz L(mb3) + +L(mb1): mulx( %r9, %rbx, %rax) + inc n + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %r9, %r8 + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x10 C mulx 16(up,un,8), %r11, %r10 + jmp L(mlo1) + +L(mb3): mulx( %r9, %r11, %r10) + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08 C mulx 8(up,un,8), %r13, %r12 + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %rbx, %rax + sub $-3, n + jz L(mwd3) + test R32(%rdx), R32(%rdx) + jmp L(mlo3) + +L(mx0): test $2, R8(un) + jz L(mb0) + +L(mb2): mulx( %r9, %r13, %r12) + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %rbx, %rax + lea 2(n), n + .byte 
0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %r9, %r8 + jmp L(mlo2) + +L(mb0): mulx( %r9, %r9, %r8) + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x08 C mulx 8(up,un,8), %r11, %r10 + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x10 C mulx 16(up,un,8), %r13, %r12 + jmp L(mlo0) + +L(mtop):jrcxz L(mend) + adc %r8, %r11 + mov %r9, (rp,n,8) +L(mlo3):.byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r10, %r13 + mov %r11, 8(rp,n,8) +L(mlo2):.byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r12, %rbx + mov %r13, 16(rp,n,8) +L(mlo1):.byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rax, %r9 + mov %rbx, 24(rp,n,8) +L(mlo0):.byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + lea 4(n), n + jmp L(mtop) + +L(mend):mov %r9, (rp) + adc %r8, %r11 +L(mwd3):mov %r11, 8(rp) + adc %r10, %r13 + mov %r13, 16(rp) + adc %r12, %rbx + adc $0, %rax + mov %rbx, 24(rp) + mov %rax, 32(rp) + add $8, vp + dec vn + jz L(end) + +C The rest of the file are 4 osp loops around addmul_1 + + test $1, R8(un) + jnz L(0x1) + +L(0x0): test $2, R8(un) + jnz L(oloop2_entry) + +L(oloop0_entry): + C initial feed-in block + mov (vp), %rdx + add $8, vp + mov un, n + add $8, rp + .byte 0xc4,0x22,0xb3,0xf6,0x04,0xf6 C mulx (up,un,8), %r9, %r8 + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x08 C mulx 8(up,un,8), %r11, %r10 + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x10 C mulx 16(up,un,8), %r13, %r12 + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x18 C mulx 24(up,un,8), %rbx, %rax + add %r8, %r11 + jmp L(lo0) + +L(oloop0): + C overlapped software pipelining block + mov (vp), %rdx C new + add $8, vp + add %r9, (rp) C prev + .byte 0xc4,0x22,0xb3,0xf6,0x04,0xf6 C mulx (%rsi,%r14,8),%r9,%r8 + adc %r11, 8(rp) C prev + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x08 C mulx 0x8(%rsi,%r14,8),%r11,%r10 + adc %r13, 16(rp) C prev + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x10 C mulx 0x10(%rsi,%r14,8),%r13,%r12 + adc %rbx, 24(rp) C prev + mov un, n + adc $0, %rax C prev + 
mov %rax, 32(rp) C prev + add $8, rp + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x18 C mulx 0x18(%rsi,%r14,8),%rbx,%rax + add %r8, %r11 C new + jmp L(lo0) + + ALIGN(16) +L(tp0): add %r9, (rp,n,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r11, 8(rp,n,8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r13, 16(rp,n,8) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rbx, 24(rp,n,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 +L(lo0): adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, n + jnz L(tp0) + + dec vn + jne L(oloop0) + + jmp L(final_wind_down) + +L(oloop2_entry): + mov (vp), %rdx + add $8, vp + lea 2(un), n + add $8, rp + .byte 0xc4,0x22,0x93,0xf6,0x24,0xf6 C mulx (up,un,8), %r13, %r12 + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %rbx, %rax + add %r12, %rbx + adc $0, %rax + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %r9, %r8 + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + add %r13, 16(rp,n,8) + jmp L(lo2) + +L(oloop2): + mov (vp), %rdx + add $8, vp + add %r9, (rp) + adc %r11, 8(rp) + adc %r13, 16(rp) + .byte 0xc4,0x22,0x93,0xf6,0x24,0xf6 C mulx (up,un,8), %r13, %r12 + adc %rbx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %rbx, %rax + lea 2(un), n + add $8, rp + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %r9, %r8 + add %r12, %rbx + adc $0, %rax + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x18 C mulx 0x18(%rsi,%r14,8),%r11,%r10 + add %r13, 16(rp,n,8) + jmp L(lo2) + + ALIGN(16) +L(tp2): add %r9, (rp,n,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r11, 8(rp,n,8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r13, 16(rp,n,8) +L(lo2): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rbx, 24(rp,n,8) 
+ adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, n + jnz L(tp2) + + dec vn + jne L(oloop2) + + jmp L(final_wind_down) + +L(0x1): test $2, R8(un) + jz L(oloop3_entry) + +L(oloop1_entry): + mov (vp), %rdx + add $8, vp + lea 1(un), n + add $8, rp + .byte 0xc4,0xa2,0xe3,0xf6,0x04,0xf6 C mulx (up,un,8), %rbx, %rax + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %r9, %r8 + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x10 C mulx 16(up,un,8), %r11, %r10 + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + add %rbx, 24(rp,n,8) + jmp L(lo1) + +L(oloop1): + mov (vp), %rdx + add $8, vp + add %r9, (rp) + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x08 C mulx 8(up,un,8), %r9, %r8 + adc %r11, 8(rp) + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x10 C mulx 16(up,un,8), %r11, %r10 + adc %r13, 16(rp) + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x18 C mulx 0x18(%rsi,%r14,8),%r13,%r12 + adc %rbx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) + .byte 0xc4,0xa2,0xe3,0xf6,0x04,0xf6 C mulx (up,un,8), %rbx, %rax + lea 1(un), n + add $8, rp + add %rbx, 24(rp,n,8) + jmp L(lo1) + + ALIGN(16) +L(tp1): add %r9, (rp,n,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r11, 8(rp,n,8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r13, 16(rp,n,8) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rbx, 24(rp,n,8) +L(lo1): adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, n + jnz L(tp1) + + dec vn + jne L(oloop1) + + jmp L(final_wind_down) + +L(oloop3_entry): + mov (vp), %rdx + add $8, vp + lea 3(un), n + add $8, rp + .byte 0xc4,0x22,0xa3,0xf6,0x14,0xf6 C mulx (up,un,8), %r11, %r10 + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08 C mulx 8(up,un,8), %r13, %r12 + .byte 
0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %rbx, %rax + add %r10, %r13 + adc %r12, %rbx + adc $0, %rax + test n, n + jz L(wd3) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + add %r11, 8(rp,n,8) + jmp L(lo3) + +L(oloop3): + mov (vp), %rdx + add $8, vp + add %r9, (rp) + adc %r11, 8(rp) + .byte 0xc4,0x22,0xa3,0xf6,0x14,0xf6 C mulx (up,un,8), %r11, %r10 + adc %r13, 16(rp) + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08 C mulx 8(up,un,8), %r13, %r12 + adc %rbx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %rbx, %rax + lea 3(un), n + add $8, rp + add %r10, %r13 + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r12, %rbx + adc $0, %rax + add %r11, 8(rp,n,8) + jmp L(lo3) + + ALIGN(16) +L(tp3): add %r9, (rp,n,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r11, 8(rp,n,8) +L(lo3): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r13, 16(rp,n,8) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rbx, 24(rp,n,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, n + jnz L(tp3) + + dec vn + jne L(oloop3) + +L(final_wind_down): + add %r9, (rp) + adc %r11, 8(rp) + adc %r13, 16(rp) + adc %rbx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) + +L(end): pop %rbx + pop %rbp + pop %r12 + pop %r13 + pop %r14 + pop %r15 + FUNC_EXIT() + ret + +L(3): mov (vp), %rdx + add $8, vp + add $8, rp + .byte 0xc4,0x22,0xa3,0xf6,0x14,0xf6 C mulx (up,un,8), %r11, %r10 + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08 C mulx 8(up,un,8), %r13, %r12 + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10 C mulx 16(up,un,8), %rbx, %rax + add %r10, %r13 + adc %r12, %rbx + adc $0, %rax +L(wd3): adc %r11, 8(rp) + adc %r13, 16(rp) + adc %rbx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) + dec vn + jne L(3) + jmp L(end) +EPILOGUE() diff --git 
a/gcc/gmp/mpn/x86_64/zen/mullo_basecase.asm b/gcc/gmp/mpn/x86_64/zen/mullo_basecase.asm new file mode 100644 index 0000000..2ae729a 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/zen/mullo_basecase.asm @@ -1,0 +1,299 @@ +dnl X86-64 mpn_mullo_basecase optimised for AMD Zen. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp_param', `%rdx') +define(`n', `%rcx') + +define(`vp', `%r11') +define(`nn', `%rbp') + +C TODO +C * Rearrange feed-in jumps for short branch forms. +C * Roll out the heavy artillery and 4-way unroll outer loop. Since feed-in +C code implodes, the blow-up will not be more than perhaps 2.5x.
+C * Micro-optimise critical lead-in code blocks. +C * Clean up register use, e.g. r15 vs vp, disuse of nn, etc. +C * Write n < 4 code specifically for Zen (current code is for Haswell). + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mullo_basecase) + FUNC_ENTRY(4) + cmp $4, R32(n) + jae L(big) + + mov vp_param, vp + mov (up), %rdx + + cmp $2, R32(n) + jae L(gt1) +L(n1): imul (vp), %rdx + mov %rdx, (rp) + FUNC_EXIT() + ret +L(gt1): ja L(gt2) +L(n2): mov (vp), %r9 + mulx( %r9, %rax, %rdx) + mov %rax, (rp) + mov 8(up), %rax + imul %r9, %rax + add %rax, %rdx + mov 8(vp), %r9 + mov (up), %rcx + imul %r9, %rcx + add %rcx, %rdx + mov %rdx, 8(rp) + FUNC_EXIT() + ret +L(gt2): +L(n3): mov (vp), %r9 + mulx( %r9, %rax, %r10) C u0 x v0 + mov %rax, (rp) + mov 8(up), %rdx + mulx( %r9, %rax, %rdx) C u1 x v0 + imul 16(up), %r9 C u2 x v0 + add %rax, %r10 + adc %rdx, %r9 + mov 8(vp), %r8 + mov (up), %rdx + mulx( %r8, %rax, %rdx) C u0 x v1 + add %rax, %r10 + adc %rdx, %r9 + imul 8(up), %r8 C u1 x v1 + add %r8, %r9 + mov %r10, 8(rp) + mov 16(vp), %r10 + mov (up), %rax + imul %rax, %r10 C u0 x v2 + add %r10, %r9 + mov %r9, 16(rp) + FUNC_EXIT() + ret + + ALIGN(16) +L(big): push %r15 + push %r14 + push %r13 + push %r12 + push %rbp + push %rbx + + mov (up), %r9 + lea -8(up,n,8), up + lea -40(rp,n,8), rp + + mov $4, R32(%r14) + sub n, %r14 + mov -8(vp_param,n,8), %rbp + imul %r9, %rbp + lea 8(vp_param), %r15 + mov (vp_param), %rdx + + test $1, R8(%r14) + jnz L(mx0) +L(mx1): test $2, R8(%r14) + jz L(mb3) + +L(mb1): mulx( %r9, %rbx, %rax) + lea -2(%r14), n + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0xf0 C mulx -0x10(%rsi,%r14,8),%r9,%r8 + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%r11,%r10 + jmp L(mlo1) + +L(mb3): mulx( %r9, %r11, %r10) + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0xf0 C mulx -0x10(%rsi,%r14,8),%r13,%r12 + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%rbx,%rax + lea (%r14), n + jrcxz L(x) + jmp 
L(mlo3) +L(x): jmp L(mcor) + +L(mb2): mulx( %r9, %r13, %r12) + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0xf0 C mulx -0x10(%rsi,%r14,8),%rbx,%rax + lea -1(%r14), n + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%r9,%r8 + jmp L(mlo2) + +L(mx0): test $2, R8(%r14) + jz L(mb2) + +L(mb0): mulx( %r9, %r9, %r8) + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0xf0 C mulx -0x10(%rsi,%r14,8),%r11,%r10 + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%r13,%r12 + lea -3(%r14), n + jmp L(mlo0) + + ALIGN(16) +L(mtop):jrcxz L(mend) + adc %r8, %r11 + mov %r9, (rp,n,8) +L(mlo3):.byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r10, %r13 + mov %r11, 8(rp,n,8) +L(mlo2):.byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r12, %rbx + mov %r13, 16(rp,n,8) +L(mlo1):.byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rax, %r9 + mov %rbx, 24(rp,n,8) +L(mlo0):.byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + lea 4(n), n + jmp L(mtop) + +L(mend):mov %r9, (rp) + adc %r8, %r11 + mov %r11, 8(rp) + adc %r10, %r13 + mov %r13, 16(rp) + adc %r12, %rbx + mov %rbx, 24(rp) + +L(outer): + mulx( (up), %r10, %r8) C FIXME r8 unused (use imul?) 
+ adc %rax, %rbp + add %r10, %rbp + mov (%r15), %rdx + add $8, %r15 + mov -24(up,%r14,8), %r8 + lea -8(up), up + + test $1, R8(%r14) + jz L(x0) +L(x1): test $2, R8(%r14) + jnz L(b3) + +L(b1): mulx( %r8, %rbx, %rax) + lea -1(%r14), n + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (%rsi,%rcx,8),%r9,%r8 + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 0x8(%rsi,%rcx,8),%r11,%r10 + jmp L(lo1) + +L(x0): test $2, R8(%r14) + jz L(b2) + +L(b0): mulx( %r8, %r9, %r8) + lea -2(%r14), n + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%r11,%r10 + .byte 0xc4,0x22,0x93,0xf6,0x24,0xf6 C mulx (%rsi,%r14,8),%r13,%r12 + jmp L(lo0) + +L(b3): mulx( %r8, %r11, %r10) + lea 1(%r14), n + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%r13,%r12 + .byte 0xc4,0xa2,0xe3,0xf6,0x04,0xf6 C mulx (%rsi,%r14,8),%rbx,%rax + add %r10, %r13 + adc %r12, %rbx + adc $0, %rax + jrcxz L(cor) + jmp L(lo3) + +L(cor): add 8(rp), %r11 + mov 16(rp), %r10 + mov 24(rp), %r12 +L(mcor):mov %r11, 8(rp) + adc %r10, %r13 + adc %r12, %rbx + mulx( (up), %r10, %r8) C FIXME r8 unused (use imul?) + adc %rax, %rbp + add %r10, %rbp + mov (%r15), %rdx + mov -24(up), %r8 + mulx( %r8, %r9, %r12) + mulx( -16,(up), %r14, %rax) + add %r12, %r14 + adc $0, %rax + adc %r9, %r13 + mov %r13, 16(rp) + adc %r14, %rbx + mulx( -8,(up), %r10, %r8) C FIXME r8 unused (use imul?) + adc %rax, %rbp + add %r10, %rbp + mov 8(%r15), %rdx + mulx( -24,(up), %r14, %rax) + add %r14, %rbx + mov %rbx, 24(rp) + mulx( -16,(up), %r10, %r8) C FIXME r8 unused (use imul?) 
+ adc %rax, %rbp + add %r10, %rbp + mov %rbp, 32(rp) + pop %rbx + pop %rbp + pop %r12 + pop %r13 + pop %r14 + pop %r15 + FUNC_EXIT() + ret + +L(b2): mulx( %r8, %r13, %r12) + lea (%r14), n + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0xf8 C mulx -0x8(%rsi,%r14,8),%rbx,%rax + add %r12, %rbx + adc $0, %rax + .byte 0xc4,0x22,0xb3,0xf6,0x04,0xf6 C mulx (%rsi,%r14,8),%r9,%r8 + jmp L(lo2) + + ALIGN(16) +L(top): add %r9, (rp,n,8) +L(lo3): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r11, 8(rp,n,8) +L(lo2): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r13, 16(rp,n,8) +L(lo1): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rbx, 24(rp,n,8) + adc %rax, %r9 +L(lo0): .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, n + js L(top) + + add %r9, (rp) + adc %r11, 8(rp) + adc %r13, 16(rp) + adc %rbx, 24(rp) + inc %r14 + jmp L(outer) +EPILOGUE() diff --git a/gcc/gmp/mpn/x86_64/zen/popcount.asm b/gcc/gmp/mpn/x86_64/zen/popcount.asm new file mode 100644 index 0000000..be1613b 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/zen/popcount.asm @@ -1,0 +1,38 @@ +dnl AMD64 mpn_popcount -- population count. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_popcount) +include_mpn(`x86_64/coreinhm/popcount.asm') diff --git a/gcc/gmp/mpn/x86_64/zen/rshift.asm b/gcc/gmp/mpn/x86_64/zen/rshift.asm new file mode 100644 index 0000000..0196870 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/zen/rshift.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_rshift optimised for AMD Zen. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_rshift) +include_mpn(`x86_64/fastsse/rshift-movdqu2.asm') diff --git a/gcc/gmp/mpn/x86_64/zen/sbpi1_bdiv_r.asm b/gcc/gmp/mpn/x86_64/zen/sbpi1_bdiv_r.asm new file mode 100644 index 0000000..f6e8f9c 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/zen/sbpi1_bdiv_r.asm @@ -1,0 +1,507 @@ +dnl AMD64 mpn_sbpi1_bdiv_r optimised for AMD Zen + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +define(`up', `%rdi') +define(`un_param', `%rsi') +define(`dp_param', `%rdx') +define(`dn_param', `%rcx') +define(`dinv', `%r8') + +define(`i', `%rcx') +define(`dn', `%r14') + +define(`dp', `%rsi') +define(`un', `%r15') + +C TODO +C * The o1...o8 loops for special dn counts were naively hand-optimised by +C folding the generic loops. They can probably be tuned. The speculative +C quotient limb generation might not be in the optimal spot. 
+C * Perhaps avoid late-in-loop jumps, e.g., lo0. +C * Improve regalloc wrt dn_param/dn and un_param/un to save some moves. + +C ABI_SUPPORT(DOS64) +C ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_sbpi1_bdiv_r) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), dinv ') + push %r15 + push %r14 + push %r13 + push %r12 + push %rbp + push %rbx + + sub dn_param, un_param C outer loop count + mov dn_param, dn C FIXME: Suppress by reg re-alloc + push dinv C keep dinv on stack + mov un_param, un C FIXME: Suppress by reg re-alloc + xor R32(%rbp), R32(%rbp) C carry accumulator, moved to %rax at L(ret) + + lea (dp_param,dn_param,8), dp C dp = &dp_param[dn_param] (8-byte limbs) + + mov (up), %rdx + imul dinv, %rdx C first quotient limb + + neg dn C dn now holds -dn_param + lea -32(up,dn_param,8), up + + test $1, R8(dn_param) C dispatch on the two low bits of dn_param + jnz L(cx1) + +L(cx0): test $2, R8(dn_param) + jnz L(b2) + + +C ============================================================================= +L(b0): cmp $-4, dn C low bits 00; is dn_param exactly 4? + jnz L(gt4) + +L(o4): mulx( -32,(dp), %r9, %r14) + mulx( -24,(dp), %r11, %r10) + mulx( -16,(dp), %r13, %r12) + mulx( -8,(dp), %rbx, %rax) + add %r14, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add (up), %r9 + adc 8(up), %r11 + mov %r8, %rdx C dinv + mov %r11, 8(up) + mulx( %r11, %rdx, %r12) C next quotient + adc %r13, 16(up) + adc %rbx, 24(up) + adc %rbp, %rax C add carry saved in %rbp (zero on the first pass) + setc R8(%rbp) + add %rax, 32(up) + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(o4) + jmp L(ret) + +L(gt4): cmp $-8, dn C is dn_param exactly 8? + jnz L(out0) + +L(o8): mulx( -64,(dp), %r9, %r14) + mulx( -56,(dp), %rcx, %r10) + mulx( -48,(dp), %r13, %r12) + mulx( -40,(dp), %rbx, %rax) + add %r14, %rcx + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add -32(up), %r9 C add just to produce carry + mulx( -32,(dp), %r9, %r14) + adc -24(up), %rcx + mov %rcx, -24(up) + mulx( -24,(dp), %r11, %r10) + adc %r13, -16(up) + mulx( -16,(dp), %r13, %r12) + adc %rbx, -8(up) + adc %rax, %r9 + mulx( -8,(dp), %rbx, %rax) + adc %r14, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov %r8, %rdx C dinv + mulx( %rcx, %rdx, %r12) C next quotient + add %r9, (up) + adc %r11, 8(up) + adc
%r13, 16(up) + adc %rbx, 24(up) + adc %rbp, %rax + setc R8(%rbp) + add %rax, 32(up) + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(o8) + jmp L(ret) + +L(out0):mov dn, i C i = -dn_param; inner loop adds 4 until i is no longer negative + .byte 0xc4,0x22,0xb3,0xf6,0x04,0xf6 C mulx (dp,dn,8),%r9,%r8 + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x08 C mulx 8(dp,dn,8),%r11,%r10 + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x10 C mulx 16(dp,dn,8),%r13,%r12 + clc + jmp L(lo0) + + ALIGN(16) +L(top0):add %r9, (up,i,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (dp,i,8), %r9, %r8 + adc %r11, 8(up,i,8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(dp,i,8), %r11, %r10 + adc %r13, 16(up,i,8) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(dp,i,8), %r13, %r12 + adc %rbx, 24(up,i,8) + adc %rax, %r9 +L(lo0): .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(dp,i,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, i + js L(top0) + + mov (%rsp), %rdx C dinv + .byte 0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28 C mulx 40(%rdi,%r14,8),%rdx,%r12 + add %r9, (up) + adc %r11, 8(up) + adc %r13, 16(up) + adc %rbx, 24(up) + adc %rbp, %rax + setc R8(%rbp) + add %rax, 32(up) + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(out0) + jmp L(ret) + +L(cx1): test $2, R8(dn_param) + jnz L(b3) + +C ============================================================================= +L(b1): cmp $-1, dn C low bits 01; is dn_param exactly 1? + jnz L(gt1) + + mov 24(up), %r9 +L(o1): mulx( -8,(dp), %rbx, %rdx) + add %r9, %rbx + adc %rbp, %rdx + add 32(up), %rdx + setc R8(%rbp) + mov %rdx, %r9 + mulx( %r8, %rdx, %r12) C next quotient + lea 8(up), up + dec un + jne L(o1) + mov %r9, 24(up) + jmp L(ret) + +L(gt1): cmp $-5, dn + jnz L(out1) + +L(o5): mulx( -40,(dp), %rbx, %rax) + mulx( -32,(dp), %r9, %r14) + mulx( -24,(dp), %r11, %r10) + mulx( -16,(dp), %r13, %r12) + add -8(up), %rbx C add just to produce carry + adc %rax, %r9 + mulx( -8,(dp), %rbx, %rax) + adc %r14, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add (up), %r9 + mov %r9, (up) + mov %r8, %rdx C dinv + mulx( %r9, %rdx, %r12) C next
quotient + adc %r11, 8(up) + adc %r13, 16(up) + adc %rbx, 24(up) + adc %rbp, %rax + setc R8(%rbp) + add %rax, 32(up) + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(o5) + jmp L(ret) + +L(out1):lea 1(dn), i + .byte 0xc4,0xa2,0xe3,0xf6,0x04,0xf6 C mulx (dp,dn,8),%rbx,%rax + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x08 C mulx 8(dp,dn,8),%r9,%r8 + .byte 0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x10 C mulx 16(dp,dn,8),%r11,%r10 + clc + jmp L(lo1) + + ALIGN(16) +L(top1):add %r9, (up,i,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (dp,i,8), %r9, %r8 + adc %r11, 8(up,i,8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(dp,i,8), %r11, %r10 + adc %r13, 16(up,i,8) +L(lo1): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(dp,i,8), %r13, %r12 + adc %rbx, 24(up,i,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(dp,i,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, i + js L(top1) + + mov (%rsp), %rdx C dinv + .byte 0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28 C mulx 40(up,dn,8), %rdx, %r12 + add %r9, (up) + adc %r11, 8(up) + adc %r13, 16(up) + adc %rbx, 24(up) + adc %rbp, %rax + setc R8(%rbp) + add %rax, 32(up) + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(out1) + jmp L(ret) + +C ============================================================================= +L(b2): cmp $-2, dn C low bits 10; is dn_param exactly 2? + jnz L(gt2) + + mov 16(up), %r10 + mov 24(up), %r9 +L(o2): mulx( -16,(dp), %r13, %r12) + mulx( -8,(dp), %rbx, %rax) + add %r12, %rbx + adc $0, %rax + add %r10, %r13 C add just to produce carry + mov %r9, %r10 + adc %rbx, %r10 + mov %r8, %rdx C dinv + mulx( %r10, %rdx, %r12) C next quotient + adc %rbp, %rax + setc R8(%rbp) + mov 32(up), %r9 + add %rax, %r9 + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(o2) + mov %r10, 16(up) + mov %r9, 24(up) + jmp L(ret) + +L(gt2): cmp $-6, dn + jnz L(out2) + +L(o6): mulx( -48,(dp), %r13, %r12) + mulx( -40,(dp), %rcx, %rax) + add %r12, %rcx + adc $0, %rax + mulx( -32,(dp), %r9, %r14) + mulx( -24,(dp), %r11, %r10) +
add -16(up), %r13 C add just to produce carry + mulx( -16,(dp), %r13, %r12) + adc -8(up), %rcx + mov %rcx, -8(up) + adc %rax, %r9 + mulx( -8,(dp), %rbx, %rax) + adc %r14, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov %r8, %rdx C dinv + mulx( %rcx, %rdx, %r12) C next quotient + add %r9, (up) + adc %r11, 8(up) + adc %r13, 16(up) + adc %rbx, 24(up) + adc %rbp, %rax + setc R8(%rbp) + add %rax, 32(up) + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(o6) + jmp L(ret) + +L(out2):lea 2(dn), i + .byte 0xc4,0x22,0x93,0xf6,0x24,0xf6 C mulx (dp,dn,8),%r13,%r12 + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x08 C mulx 8(dp,dn,8),%rbx,%rax + add %r12, %rbx + adc $0, %rax + .byte 0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x10 C mulx 16(dp,dn,8),%r9,%r8 + jmp L(lo2) + + ALIGN(16) +L(top2):add %r9, (up,i,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (dp,i,8), %r9, %r8 + adc %r11, 8(up,i,8) +L(lo2): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(dp,i,8), %r11, %r10 + adc %r13, 16(up,i,8) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(dp,i,8), %r13, %r12 + adc %rbx, 24(up,i,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(dp,i,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, i + js L(top2) + + mov (%rsp), %rdx C dinv + .byte 0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28 C mulx 40(up,dn,8), %rdx, %r12 + add %r9, (up) + adc %r11, 8(up) + adc %r13, 16(up) + adc %rbx, 24(up) + adc %rbp, %rax + setc R8(%rbp) + add %rax, 32(up) + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(out2) + jmp L(ret) + +C ============================================================================= +L(b3): cmp $-3, dn C low bits 11; is dn_param exactly 3? + jnz L(gt3) + + mov 8(up), %r14 + mov 16(up), %r9 + mov 24(up), %rcx +L(o3): mulx( -24,(dp), %r11, %r10) + mulx( -16,(dp), %r13, %r12) + mulx( -8,(dp), %rbx, %rax) + add %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add %r14, %r11 + mov %r9, %r14 + adc %r13, %r14 + mov %rcx, %r9 + mov %r8, %rdx C dinv + mulx( %r14, %rdx, %r12) C next quotient + adc %rbx, %r9 +
adc %rbp, %rax + setc R8(%rbp) + mov 32(up), %rcx + add %rax, %rcx + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(o3) + mov %r14, 8(up) + mov %r9, 16(up) + mov %rcx, 24(up) + jmp L(ret) + +L(gt3): cmp $-7, dn + jnz L(out3) + +L(o7): mulx( -56,(dp), %r11, %r10) + mulx( -48,(dp), %rcx, %r12) + mulx( -40,(dp), %rbx, %rax) + add %r10, %rcx + adc %r12, %rbx + adc $0, %rax + mulx( -32,(dp), %r9, %r14) + add -24(up), %r11 C add just to produce carry + mulx( -24,(dp), %r11, %r10) + adc -16(up), %rcx + mov %rcx, -16(up) + mulx( -16,(dp), %r13, %r12) + adc %rbx, -8(up) + adc %rax, %r9 + mulx( -8,(dp), %rbx, %rax) + adc %r14, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov %r8, %rdx C dinv + mulx( %rcx, %rdx, %r12) C next quotient + add %r9, (up) + adc %r11, 8(up) + adc %r13, 16(up) + adc %rbx, 24(up) + adc %rbp, %rax + setc R8(%rbp) + add %rax, 32(up) + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(o7) + jmp L(ret) + +L(out3):lea 3(dn), i + .byte 0xc4,0x22,0xa3,0xf6,0x14,0xf6 C mulx (dp,dn,8),%r11,%r10 + .byte 0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08 C mulx 8(dp,dn,8),%r13,%r12 + .byte 0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10 C mulx 16(dp,dn,8),%rbx,%rax + add %r10, %r13 + adc %r12, %rbx + adc $0, %rax + jmp L(lo3) + + ALIGN(16) +L(top3):add %r9, (up,i,8) +L(lo3): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (dp,i,8), %r9, %r8 + adc %r11, 8(up,i,8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(dp,i,8), %r11, %r10 + adc %r13, 16(up,i,8) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(dp,i,8), %r13, %r12 + adc %rbx, 24(up,i,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(dp,i,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, i + js L(top3) + + mov (%rsp), %rdx C dinv + .byte 0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28 C mulx 40(up,dn,8), %rdx, %r12 + add %r9, (up) + adc %r11, 8(up) + adc %r13, 16(up) + adc %rbx, 24(up) + adc %rbp, %rax + setc R8(%rbp) + add %rax, 32(up) + adc $0, R32(%rbp) + lea 8(up), up + dec un + jne L(out3) +
+L(ret): mov %rbp, %rax C return the carry accumulated in %rbp + pop %rsi C dummy dealloc + pop %rbx + pop %rbp + pop %r12 + pop %r13 + pop %r14 + pop %r15 + ret C NOTE(review): no FUNC_EXIT() pairs with FUNC_ENTRY(4) on this path; confirm intended +EPILOGUE() diff --git a/gcc/gmp/mpn/x86_64/zen/sqr_basecase.asm b/gcc/gmp/mpn/x86_64/zen/sqr_basecase.asm new file mode 100644 index 0000000..a7c6127 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/zen/sqr_basecase.asm @@ -1,0 +1,482 @@ +dnl AMD64 mpn_sqr_basecase optimised for AMD Zen. + +dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO +C * Do overlapped software pipelining. This should close the remaining gap to +C mul_basecase. +C +C * Update un just once in the outer loop. +C +C * Perhaps keep un and n pre-multiplied by 8, thus suppressing ",8" from +C loads and stores. At least in some cases, the non-scaled form is faster. +C +C * Optimise xit3 code, e.g., using shrx and sarx like in the main loop.
+C +C * The mul_1 feed-in code has gotten little attention and could probably be +C improved. Perhaps even expand it to 4 separate loops to allow straight +C fall-through into the 4 addmul_1 loops. +C +C * Clean up ad-hoc scratch register usage in the addmul_1 feed-in code blocks. + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') + +define(`un', `%rbp') +define(`n', `%rcx') + +C these are used just for the small op code +define(`w0', `%r8') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_sqr_basecase) + FUNC_ENTRY(3) + + cmp $2, R32(un_param) + jae L(gt1) C fall through only when un_param < 2 + + mov (up), %rdx + mulx( %rdx, %rax, %rdx) + mov %rax, (rp) + mov %rdx, 8(rp) + FUNC_EXIT() + ret + +L(gt1): jne L(gt2) C flags still from cmp $2 above: un_param == 2 falls through + + mov (up), %rdx + mov 8(up), %rcx + mulx( %rcx, %r9, %r10) C v0 * v1 W 1 2 + mulx( %rdx, %rax, %r8) C v0 * v0 W 0 1 + mov %rcx, %rdx + mulx( %rdx, %r11, %rdx) C v1 * v1 W 2 3 + add %r9, %r9 C W 1 + adc %r10, %r10 C W 2 + adc $0, %rdx C W 3 + add %r9, %r8 C W 1 + adc %r11, %r10 C W 2 + adc $0, %rdx C W 3 + mov %rax, (rp) + mov %r8, 8(rp) + mov %r10, 16(rp) + mov %rdx, 24(rp) + FUNC_EXIT() + ret + +L(gt2): cmp $4, R32(un_param) + jae L(gt3) C un_param == 3 falls through + + push %rbx + mov (up), %rdx + mulx( 8,(up), w2, w3) + mulx( 16,(up), w0, w1) + add w3, w0 + mov 8(up), %rdx + mulx( 16,(up), %rax, w3) + adc %rax, w1 + adc $0, w3 + test R32(%rbx), R32(%rbx) C clears CF and OF ahead of the adcx/adox chains + mov (up), %rdx + mulx( %rdx, %rbx, %rcx) + mov %rbx, (rp) + mov 8(up), %rdx + mulx( %rdx, %rax, %rbx) + mov 16(up), %rdx + mulx( %rdx, %rsi, %rdx) + adcx( w2, w2) + adcx( w0, w0) + adcx( w1, w1) + adcx( w3, w3) + adox( w2, %rcx) + adox( w0, %rax) + adox( w1, %rbx) + adox( w3, %rsi) + mov $0, R32(%r8) + adox( %r8, %rdx) + adcx( %r8, %rdx) + mov %rcx, 8(rp) + mov %rax, 16(rp) + mov %rbx, 24(rp) + mov %rsi, 32(rp) + mov %rdx, 40(rp) + pop %rbx + FUNC_EXIT() + ret + +L(gt3): push %r15 +C push %r14 + push %r13 + push %r12 + push %rbp + push
%rbx + mov R32(un_param), R32(un) + + mov (up), %rdx C up[0] + mov 8(up), %r9 C up[1] + + mulx( %rdx, %rax, %r15) C up[0]^2 + mov %rax, (rp) + shl %rdx C rdx = 2*up[0], top bit shifted out + + lea (up,un,8), up + lea -32(rp,un,8), rp + + neg un + lea 4(un), n + and $-4, n + + test $1, R8(un) + jnz L(mx0) +L(mx1): test $2, R8(un) + jz L(mb3) + +L(mb1): mulx( %r9, %rbx, %rax) + .byte 0xc4,0x62,0xb3,0xf6,0x44,0xee,0x10 C mulx 16(up,un,8), %r9, %r8 + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xee,0x18 C mulx 24(up,un,8), %r11, %r10 + add %r15, %rbx + jmp L(mlo1) + +L(mb3): mulx( %r9, %r11, %r10) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xee,0x10 C mulx 16(up,un,8), %r13, %r12 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x18 C mulx 24(up,un,8), %rbx, %rax + add %r15, %r11 + jrcxz L(n4) + jmp L(mlo3) +L(n4): mov %r11, 8(rp) + adc %r10, %r13 + adc %r12, %rbx + jmp L(m) + +L(mx0): test $2, R8(un) + jnz L(mb0) + +L(mb2): mulx( %r9, %r13, %r12) + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x10 C mulx 16(up,un,8), %rbx, %rax + .byte 0xc4,0x62,0xb3,0xf6,0x44,0xee,0x18 C mulx 24(up,un,8), %r9, %r8 + add %r15, %r13 + jmp L(mlo2) + +L(mb0): mulx( %r9, %r9, %r8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xee,0x10 C mulx 16(up,un,8), %r11, %r10 + .byte 0xc4,0x62,0x93,0xf6,0x64,0xee,0x18 C mulx 24(up,un,8), %r13, %r12 + add %r15, %r9 + jmp L(mlo0) + + ALIGN(16) +L(mtop):jrcxz L(mend) C n is %rcx; jrcxz leaves CF alone for the adc chain + adc %r8, %r11 + mov %r9, (rp,n,8) +L(mlo3):.byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r10, %r13 + mov %r11, 8(rp,n,8) +L(mlo2):.byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r12, %rbx + mov %r13, 16(rp,n,8) +L(mlo1):.byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rax, %r9 + mov %rbx, 24(rp,n,8) +L(mlo0):.byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + lea 4(n), n C lea rather than add, so CF survives + jmp L(mtop) + +L(mend):mov %r9, (rp) + adc %r8, %r11 + mov %r11, 8(rp) + adc %r10, %r13 + mov %r13, 16(rp) + adc %r12, %rbx + adc $0, %rax + mov %rbx, 24(rp) + mov %rax, 32(rp) + + lea 2(un), un + + mov $63,
R32(%r15) C keep at 63 for shrx/sarx. + test $1, R8(un) + jz L(x0) +L(x1): test $2, R8(un) + jz L(f3) + jmp L(f1) +L(x0): test $2, R8(un) + jz L(f0) +C jmp L(f2) + +L(f2): mov -8(up,un,8), %rdx C up[0] + lea 2(un), n + lea 8(rp), rp + .byte 0xc4,0x62,0x82,0xf7,0x5c,0xee,0xf0 C sarx %r15, -16(up,un,8), %r11 + .byte 0xc4,0x62,0x83,0xf7,0x6c,0xee,0xf0 C shrx %r15, -16(up,un,8), %r13 + and %rdx, %r11 C "ci" in C code + mulx( %rdx, %rax, %r10) C up[0]^2 + lea (%r13,%rdx,2), %rdx C "u0" arg in C code + add %rax, %r11 + + .byte 0xc4,0x62,0x93,0xf6,0x24,0xee C mulx (up,un,8), %r13, %r12 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x08 C mulx 8(up,un,8), %rbx, %rax + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + jmp L(b2) + + ALIGN(16) +L(top2):add %r9, (rp,n,8) +L(b2): .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r11, 8(rp,n,8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r13, 16(rp,n,8) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rbx, 24(rp,n,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, n + jnz L(top2) + + inc un + add %r9, (rp) + adc %r11, 8(rp) + adc %r13, 16(rp) + adc %rbx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) C falls through into L(f1) + +L(f1): mov -8(up,un,8), %rdx C up[0] + lea 1(un), n + lea 8(rp), rp + .byte 0xc4,0x62,0x82,0xf7,0x6c,0xee,0xf0 C sarx %r15, -16(up,un,8), %r13 + .byte 0xc4,0xe2,0x83,0xf7,0x5c,0xee,0xf0 C shrx %r15, -16(up,un,8), %rbx + and %rdx, %r13 C "ci" in C code + mulx( %rdx, %rax, %r12) C up[0]^2 + lea (%rbx,%rdx,2), %rdx C "u0" arg in C code + add %rax, %r13 + + .byte 0xc4,0xe2,0xe3,0xf6,0x04,0xee C mulx (up,un,8), %rbx, %rax + adc %r12, %rbx + adc $0, %rax + .byte 0xc4,0x62,0xb3,0xf6,0x44,0xee,0x08 C mulx 8(up,un,8), %r9, %r8 + jmp L(b1) + + ALIGN(16) +L(top1):add %r9, (rp,n,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r11,
8(rp,n,8) +L(b1): .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r13, 16(rp,n,8) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rbx, 24(rp,n,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, n + jnz L(top1) + + inc un + add %r9, (rp) + adc %r11, 8(rp) + adc %r13, 16(rp) + adc %rbx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) C falls through into L(f0) + +L(f0): mov -8(up,un,8), %rdx C up[0] + lea (un), n + lea 8(rp), rp + .byte 0xc4,0xe2,0x82,0xf7,0x5c,0xee,0xf0 C sarx %r15, -16(up,un,8), %rbx + .byte 0xc4,0x62,0x83,0xf7,0x4c,0xee,0xf0 C shrx %r15, -16(up,un,8), %r9 + and %rdx, %rbx C "ci" in C code + mulx( %rdx, %r10, %rax) C up[0]^2 + lea (%r9,%rdx,2), %rdx C "u0" arg in C code + add %r10, %rbx + adc $0, %rax C "cin" in C code + + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 (bytes encode index n; n == un here) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xee,0x08 C mulx 8(up,un,8), %r11, %r10 + jmp L(b0) + + ALIGN(16) +L(top0):add %r9, (rp,n,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r11, 8(rp,n,8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r13, 16(rp,n,8) +L(b0): .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rbx, 24(rp,n,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, n + jnz L(top0) + + inc un + add %r9, (rp) + adc %r11, 8(rp) + adc %r13, 16(rp) + adc %rbx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) C falls through into L(f3) + +L(f3): mov -8(up,un,8), %rdx C up[0] + lea 3(un), n + lea 8(rp), rp + .byte 0xc4,0x62,0x82,0xf7,0x4c,0xee,0xf0 C sarx %r15, -16(up,un,8), %r9 + .byte 0xc4,0x62,0x83,0xf7,0x5c,0xee,0xf0 C shrx %r15, -16(up,un,8), %r11 + and %rdx, %r9 C "ci" in C code + mulx( %rdx, %rax, %r8) C up[0]^2 + lea (%r11,%rdx,2), %rdx C "u0" arg in C code +
add %rax, %r9 + + .byte 0xc4,0x62,0xa3,0xf6,0x14,0xee C mulx (%rsi,%rbp,8),%r11,%r10 + .byte 0xc4,0x62,0x93,0xf6,0x64,0xee,0x08 C mulx 0x8(%rsi,%rbp,8),%r13,%r12 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x10 C mulx 0x10(%rsi,%rbp,8),%rbx,%rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + jrcxz L(xit3) C n == 0: leave the outer loop + jmp L(top3) C FIXME perhaps fall through + + ALIGN(16) +L(top3):add %r9, (rp,n,8) + .byte 0xc4,0x62,0xb3,0xf6,0x04,0xce C mulx (up,n,8), %r9, %r8 + adc %r11, 8(rp,n,8) + .byte 0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08 C mulx 8(up,n,8), %r11, %r10 + adc %r13, 16(rp,n,8) + .byte 0xc4,0x62,0x93,0xf6,0x64,0xce,0x10 C mulx 16(up,n,8), %r13, %r12 + adc %rbx, 24(rp,n,8) + adc %rax, %r9 + .byte 0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18 C mulx 24(up,n,8), %rbx, %rax + adc %r8, %r11 + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + add $4, n + jnz L(top3) + + inc un + add %r9, (rp) + adc %r11, 8(rp) + adc %r13, 16(rp) + adc %rbx, 24(rp) + adc $0, %rax + mov %rax, 32(rp) + jmp L(f2) + + +L(xit3):add %r9, (rp) + adc %r11, 8(rp) + adc 16(rp), %r13 + adc 24(rp), %rbx +L(m): adc $0, %rax + mov %rax, 32(rp) + mov -24(up), %rdx C FIXME: CSE + mov -32(up), %r9 C FIXME: CSE + sar $63, %r9 + and %rdx, %r9 + add %r13, %r9 + mulx( %rdx, %rax, %r10) + mov -16(up), %r8 C FIXME: CSE + adc $0, %r10 + add %rax, %r9 + adc $0, %r10 + mov %r9, 16(rp) + mov -32(up), %rax + shl %rax + adc %rdx, %rdx C rdx = 2*rdx with the bit that shl moved into CF shifted in + mulx( %r8, %r13, %r12) + mulx( -8,(up), %r11, %rax) C FIXME: CSE + add %r10, %r13 + adc %r12, %r11 + adc $0, %rax + add %rbx, %r13 + mov %r13, 24(rp) + adc 32(rp), %r11 + adc $0, %rax + mov -16(up), %rdx C FIXME: CSE + mov -8(up), %r8 C FIXME: CSE + mov -24(up), %r9 + sar $63, %r9 + and %rdx, %r9 + add %r11, %r9 + mulx( %rdx, %rbp, %r10) + adc $0, %r10 + add %rbp, %r9 + adc $0, %r10 + mov %r9, 32(rp) + mov -24(up), %rbp + shl %rbp + adc %rdx, %rdx + mulx( %r8, %rbx, %rbp) + add %r10, %rbx + adc $0, %rbp + adc %rbx, %rax + mov %rax, 40(rp) + adc $0, %rbp + mov -8(up), %rdx C FIXME: CSE + mov
-16(up), %r9 C FIXME: CSE + sar $63, %r9 + and %rdx, %r9 + add %rbp, %r9 + mulx( %rdx, %rbp, %r10) + adc $0, %r10 + add %rbp, %r9 + adc $0, %r10 + mov %r9, 48(rp) + mov %r10, 56(rp) + + pop %rbx + pop %rbp + pop %r12 + pop %r13 +C pop %r14 + pop %r15 + + FUNC_EXIT() + ret +EPILOGUE() diff --git a/gcc/gmp/mpn/x86_64/zen/sublsh1_n.asm b/gcc/gmp/mpn/x86_64/zen/sublsh1_n.asm new file mode 100644 index 0000000..00f6dc9 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/zen/sublsh1_n.asm @@ -1,0 +1,37 @@ +dnl X86-64 mpn_sublsh1_n, mpn_sublsh1_nc. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/.
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sublsh1_n mpn_sublsh1_nc) +include_mpn(`x86_64/atom/sublsh1_n.asm') diff --git a/gcc/gmp/mpn/x86_64/zen2/gmp-mparam.h b/gcc/gmp/mpn/x86_64/zen2/gmp-mparam.h new file mode 100644 index 0000000..3748c5f 100644 --- /dev/null +++ b/gcc/gmp/mpn/x86_64/zen2/gmp-mparam.h @@ -1,0 +1,276 @@ +/* AMD Zen2 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* Disable use of slow functions. FIXME: We should disable lib inclusion. 
*/ +#undef HAVE_NATIVE_mpn_mul_2 +#undef HAVE_NATIVE_mpn_addmul_2 + +/* 3600-4400 MHz Matisse */ +/* FFT tuning limit = 703,392,483 */ +/* Generated by tuneup.c, 2019-10-19, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 27 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 1 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD 13 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 22 + +#define DIV_1_VS_MUL_1_PERCENT 385 + +#define MUL_TOOM22_THRESHOLD 19 +#define MUL_TOOM33_THRESHOLD 125 +#define MUL_TOOM44_THRESHOLD 196 +#define MUL_TOOM6H_THRESHOLD 276 +#define MUL_TOOM8H_THRESHOLD 369 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 121 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 129 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 132 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 185 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 30 +#define SQR_TOOM3_THRESHOLD 117 +#define SQR_TOOM4_THRESHOLD 315 +#define SQR_TOOM6_THRESHOLD 446 +#define SQR_TOOM8_THRESHOLD 527 + +#define MULMID_TOOM42_THRESHOLD 38 + +#define MULMOD_BNM1_THRESHOLD 14 +#define SQRMOD_BNM1_THRESHOLD 20 + +#define MUL_FFT_MODF_THRESHOLD 436 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 436, 5}, { 25, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 49, 9}, { 27,10}, \ + { 15, 9}, { 31, 8}, { 63, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 95,10}, { 
55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 135,11}, { 79,10}, { 159,11}, { 95,10}, \ + { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,11}, { 143,10}, { 287, 9}, { 575,11}, \ + { 159,12}, { 95,11}, { 191,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 271,10}, \ + { 543, 9}, { 1087,11}, { 287,10}, { 575,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 335,10}, \ + { 671,11}, { 351,10}, { 703,11}, { 367,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,10}, \ + { 831,12}, { 223,11}, { 447,13}, { 127,12}, \ + { 255,11}, { 543,10}, { 1087,12}, { 287,11}, \ + { 575,10}, { 1151,11}, { 607,10}, { 1215,12}, \ + { 319,11}, { 639,10}, { 1279,11}, { 671,10}, \ + { 1343,12}, { 351,11}, { 703,10}, { 1407,11}, \ + { 735,13}, { 191,12}, { 383,11}, { 767,10}, \ + { 1535,11}, { 799,12}, { 415,11}, { 831,10}, \ + { 1663,12}, { 447,11}, { 895,12}, { 479,14}, \ + { 127,13}, { 255,12}, { 543,11}, { 1087,10}, \ + { 2175,12}, { 575,11}, { 1151,12}, { 607,11}, \ + { 1215,10}, { 2431,13}, { 319,12}, { 639,11}, \ + { 1279,12}, { 671,11}, { 1343,10}, { 2687,12}, \ + { 703,11}, { 1471,10}, { 2943,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 799,11}, { 1599,12}, \ + { 831,11}, { 1663,13}, { 447,12}, { 959,11}, \ + { 1919,10}, { 3839,14}, { 255,13}, { 511,12}, \ + { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1343,11}, { 2687,13}, \ + { 703,12}, { 1471,11}, { 2943,14}, { 383,13}, \ + { 767,12}, { 1599,11}, { 3199,13}, { 831,12}, \ + { 1727,13}, { 895,12}, { 1791,13}, { 959,12}, \ + { 1919,11}, { 3839,14}, { 511,13}, { 1087,12}, \ + { 2175,13}, { 1215,12}, { 2431,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1471,12}, { 2943,11}, \ + { 5887,14}, { 767,13}, { 1599,12}, { 3199,13}, \ + { 1727,12}, { 3455,14}, { 895,13}, { 1919,12}, \ + { 3839,11}, { 7679,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,13}, { 2943,12}, \ + { 5887,15}, { 767,14}, { 1535,13}, { 
3199,14}, \ + { 1663,13}, { 3455,12}, { 6911,14}, { 1919,13}, \ + { 3839,16}, { 511,15}, { 1023,14}, { 2175,13}, \ + { 4479,12}, { 8959,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2943,13}, { 5887,12}, { 11775,15}, \ + { 1535,14}, { 3455,15}, { 1791,14}, { 3839,13}, \ + { 7679,14}, { 3967,16}, { 1023,15}, { 2047,14}, \ + { 4479,15}, { 2303,14}, { 4863,15}, { 2559,14}, \ + { 5247,15}, { 2815,14}, { 5887,16}, { 1535,15}, \ + { 3327,14}, { 6911,15}, { 3839,14}, { 7679,13}, \ + { 15359,17}, { 1023,16}, { 2047,15}, { 4351,14}, \ + { 8959,15}, { 4863,16}, { 2559,15}, { 5887,14}, \ + { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \ + { 7679,14}, { 15359,15}, { 7935,17}, { 2047,16}, \ + { 4095,15}, { 8959,16}, { 4607,15}, { 9983,14}, \ + { 19967,16}, { 5631,15}, { 11775,17}, { 3071,16}, \ + { 7679,15}, { 15871,18}, { 2047,17}, { 4095,16}, \ + { 9727,15}, { 19967,17}, { 5119,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 275 +#define MUL_FFT_THRESHOLD 4736 + +#define SQR_FFT_MODF_THRESHOLD 396 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 396, 5}, { 25, 6}, { 25, 7}, { 13, 6}, \ + { 27, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 135,11}, { 79,10}, { 159,11}, { 95,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,12}, { 95,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 271,10}, { 543,11}, \ + { 287,10}, { 575,11}, { 303,12}, { 159,11}, \ + { 319,10}, { 639,11}, { 335,10}, { 671, 9}, \ + { 1343,11}, { 351,10}, { 703,11}, { 367,10}, \ + { 735,11}, { 383,10}, { 
767,11}, { 415,10}, \ + { 831,12}, { 223,11}, { 447,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \ + { 1087,12}, { 287,11}, { 575,10}, { 1151,11}, \ + { 607,10}, { 1215,12}, { 319,11}, { 639,10}, \ + { 1279,11}, { 671,10}, { 1343,12}, { 351,11}, \ + { 703,10}, { 1407,11}, { 735,10}, { 1471,12}, \ + { 383,11}, { 767,10}, { 1535,11}, { 799,12}, \ + { 415,11}, { 831,10}, { 1663,12}, { 447,11}, \ + { 895,12}, { 479,11}, { 959,14}, { 127,12}, \ + { 511,11}, { 1023,12}, { 543,11}, { 1087,10}, \ + { 2175,12}, { 575,11}, { 1151,12}, { 607,11}, \ + { 1215,10}, { 2431,12}, { 639,11}, { 1279,12}, \ + { 671,11}, { 1343,10}, { 2687,12}, { 703,11}, \ + { 1407,12}, { 735,11}, { 1471,10}, { 2943,13}, \ + { 383,12}, { 767,11}, { 1535,12}, { 799,11}, \ + { 1599,12}, { 831,11}, { 1663,13}, { 447,12}, \ + { 959,11}, { 1919,10}, { 3839,13}, { 511,12}, \ + { 1087,11}, { 2175,13}, { 575,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1343,11}, { 2687,13}, \ + { 703,12}, { 1471,11}, { 2943,14}, { 383,13}, \ + { 767,12}, { 1599,13}, { 831,12}, { 1727,11}, \ + { 3455,13}, { 959,12}, { 1919,11}, { 3839,14}, \ + { 511,13}, { 1023,12}, { 2047,13}, { 1087,12}, \ + { 2175,13}, { 1215,12}, { 2431,11}, { 4863,14}, \ + { 639,13}, { 1343,12}, { 2687,13}, { 1471,12}, \ + { 2943,11}, { 5887,14}, { 767,13}, { 1599,12}, \ + { 3199,13}, { 1727,12}, { 3455,14}, { 895,13}, \ + { 1919,12}, { 3839,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \ + { 1279,13}, { 2687,14}, { 1407,13}, { 2943,12}, \ + { 5887,15}, { 767,14}, { 1535,13}, { 3199,14}, \ + { 1663,13}, { 3455,12}, { 6911,14}, { 1919,13}, \ + { 3839,12}, { 7679,16}, { 511,15}, { 1023,14}, \ + { 2175,13}, { 4479,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2943,13}, { 5887,12}, { 11775,15}, \ + { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \ + { 3839,13}, { 7679,14}, { 3967,16}, { 1023,15}, \ + { 2047,14}, { 4479,15}, { 2303,14}, { 4863,15}, \ + { 2559,14}, { 5247,15}, { 2815,14}, { 5887,13}, \ + { 
11775,16}, { 1535,15}, { 3071,14}, { 6143,15}, \ + { 3327,14}, { 6911,15}, { 3839,14}, { 7679,17}, \ + { 1023,16}, { 2047,15}, { 4095,14}, { 8191,15}, \ + { 4351,14}, { 8959,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \ + { 3583,15}, { 7679,14}, { 15359,15}, { 7935,17}, \ + { 2047,16}, { 4095,15}, { 8959,16}, { 4607,15}, \ + { 9983,14}, { 19967,16}, { 5119,15}, { 10239,16}, \ + { 5631,15}, { 11775,17}, { 3071,16}, { 7679,15}, \ + { 15359,18}, { 2047,17}, { 4095,16}, { 9727,15}, \ + { 19967,17}, { 5119,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 282 +#define SQR_FFT_THRESHOLD 3264 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 57 +#define MULLO_MUL_N_THRESHOLD 8907 +#define SQRLO_BASECASE_THRESHOLD 8 +#define SQRLO_DC_THRESHOLD 0 /* never mpn_sqrlo_basecase */ +#define SQRLO_SQR_THRESHOLD 6440 + +#define DC_DIV_QR_THRESHOLD 43 +#define DC_DIVAPPR_Q_THRESHOLD 154 +#define DC_BDIV_QR_THRESHOLD 46 +#define DC_BDIV_Q_THRESHOLD 93 + +#define INV_MULMOD_BNM1_THRESHOLD 36 +#define INV_NEWTON_THRESHOLD 141 +#define INV_APPR_THRESHOLD 149 + +#define BINV_NEWTON_THRESHOLD 264 +#define REDC_1_TO_REDC_N_THRESHOLD 47 + +#define MU_DIV_QR_THRESHOLD 1470 +#define MU_DIVAPPR_Q_THRESHOLD 1528 +#define MUPI_DIV_QR_THRESHOLD 47 +#define MU_BDIV_QR_THRESHOLD 1187 +#define MU_BDIV_Q_THRESHOLD 1589 + +#define POWM_SEC_TABLE 3,22,194,579 + +#define GET_STR_DC_THRESHOLD 12 +#define GET_STR_PRECOMPUTE_THRESHOLD 19 +#define SET_STR_DC_THRESHOLD 195 +#define SET_STR_PRECOMPUTE_THRESHOLD 1752 + +#define FAC_DSC_THRESHOLD 345 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 24 +#define HGCD2_DIV1_METHOD 1 /* 11.29% faster than 3 */ +#define HGCD_THRESHOLD 89 +#define HGCD_APPR_THRESHOLD 96 +#define HGCD_REDUCE_THRESHOLD 2681 +#define GCD_DC_THRESHOLD 465 +#define GCDEXT_DC_THRESHOLD 233 
+#define JACOBI_BASE_METHOD 1 /* 25.56% faster than 4 */ + +/* Tuneup completed successfully, took 294200 seconds */ diff --git a/gcc/gmp/mpn/arm/v7a/cora15/bdiv_q_1.asm b/gcc/gmp/mpn/arm/v7a/cora15/bdiv_q_1.asm new file mode 100644 index 0000000..245b371 100644 --- /dev/null +++ b/gcc/gmp/mpn/arm/v7a/cora15/bdiv_q_1.asm @@ -1,0 +1,36 @@ +dnl ARM mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1) +include_mpn(`arm/v7a/cora8/bdiv_q_1.asm') diff --git a/gcc/gmp/mpn/arm/v7a/cora17/addmul_1.asm b/gcc/gmp/mpn/arm/v7a/cora17/addmul_1.asm new file mode 100644 index 0000000..c11ed47 100644 --- /dev/null +++ b/gcc/gmp/mpn/arm/v7a/cora17/addmul_1.asm @@ -1,0 +1,34 @@ +dnl ARM mpn_addmul_1 + +dnl Copyright 2018 Free Software Foundation, Inc. 
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_addmul_1) +include_mpn(`arm/v6/addmul_1.asm') diff --git a/gcc/gmp/mpn/arm/v7a/cora17/gmp-mparam.h b/gcc/gmp/mpn/arm/v7a/cora17/gmp-mparam.h new file mode 100644 index 0000000..143d4bc 100644 --- /dev/null +++ b/gcc/gmp/mpn/arm/v7a/cora17/gmp-mparam.h @@ -1,0 +1,233 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 1800 MHz Cortex-A17 with Neon (in spite of file position) */ +/* FFT tuning limit = 51243975 */ +/* Generated by tuneup.c, 2019-10-29, gcc 6.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 54.08% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 45 + +#define DIV_1_VS_MUL_1_PERCENT 248 + +#define MUL_TOOM22_THRESHOLD 38 +#define MUL_TOOM33_THRESHOLD 132 +#define MUL_TOOM44_THRESHOLD 200 +#define MUL_TOOM6H_THRESHOLD 303 +#define MUL_TOOM8H_THRESHOLD 478 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 137 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 179 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 132 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 145 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 191 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 62 +#define SQR_TOOM3_THRESHOLD 189 +#define SQR_TOOM4_THRESHOLD 354 +#define SQR_TOOM6_THRESHOLD 426 +#define SQR_TOOM8_THRESHOLD 608 + +#define 
MULMID_TOOM42_THRESHOLD 62 + +#define MULMOD_BNM1_THRESHOLD 21 +#define SQRMOD_BNM1_THRESHOLD 29 + +#define MUL_FFT_MODF_THRESHOLD 595 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 595, 5}, { 29, 6}, { 15, 5}, { 31, 6}, \ + { 16, 5}, { 33, 6}, { 29, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \ + { 39, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 49, 8}, \ + { 27, 7}, { 55, 9}, { 15, 8}, { 31, 7}, \ + { 63, 8}, { 43, 9}, { 23, 8}, { 55, 9}, \ + { 31, 8}, { 63, 9}, { 39, 8}, { 83, 9}, \ + { 47, 8}, { 95, 9}, { 55,10}, { 31, 9}, \ + { 79,10}, { 47, 9}, { 103,11}, { 31,10}, \ + { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \ + { 95, 9}, { 191,10}, { 111,11}, { 63,10}, \ + { 143, 8}, { 575,10}, { 159,11}, { 95,10}, \ + { 191, 9}, { 383, 8}, { 767, 9}, { 399, 8}, \ + { 799,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511, 8}, { 1023, 9}, { 543, 8}, { 1087, 9}, \ + { 575,10}, { 303,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 335, 9}, { 671,10}, { 351, 9}, \ + { 703,10}, { 367, 9}, { 735,11}, { 191,10}, \ + { 383, 9}, { 767,10}, { 399, 9}, { 799,10}, \ + { 415, 9}, { 831,10}, { 431, 9}, { 863,11}, \ + { 223,10}, { 447,12}, { 127,10}, { 511, 9}, \ + { 1023,10}, { 543, 9}, { 1087,10}, { 607, 9}, \ + { 1215,11}, { 319,10}, { 671, 9}, { 1343,11}, \ + { 351,10}, { 735,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,10}, { 863,11}, { 447,10}, \ + { 895,13}, { 127,11}, { 511,10}, { 1023,11}, \ + { 543,10}, { 1087,11}, { 607,10}, { 1215,12}, \ + { 319,11}, { 671,10}, { 1343,11}, { 735,10}, \ + { 1471,12}, { 383,11}, { 799,10}, { 1599,11}, \ + { 863,10}, { 1727,12}, { 447,11}, { 991,10}, \ + { 1983,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1215,10}, { 2431,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \ + { 1599,12}, { 831,11}, { 1727,12}, { 959,11}, \ + { 1983,13}, { 511,12}, { 1087,11}, { 2239,12}, \ + { 1215,11}, { 2431,13}, { 639,12}, { 1471,11}, \ + { 2943,13}, { 767,12}, { 1727,13}, { 895,12}, \ + { 1983,14}, { 
511,13}, { 1023,12}, { 2239,13}, \ + { 1151,12}, { 2495,13}, { 1279,12}, { 2623,13}, \ + { 1407,12}, { 2943,14}, { 767,13}, { 1535,12}, \ + { 3135,13}, { 1663,12}, { 3455,13}, { 1919,12}, \ + { 3839,15}, { 511,14}, { 1023,13}, { 2175,12}, \ + { 4479,13}, { 2431,14}, { 1279,13}, { 2943,12}, \ + { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \ + { 3967,15}, { 1023,14}, { 2047,13}, { 4479,14}, \ + { 2303,13}, { 4991,12}, { 9983,14}, { 2559,13}, \ + { 5247,14}, { 2815,13}, { 5887,15}, { 1535,14}, \ + { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 194 +#define MUL_FFT_THRESHOLD 6784 + +#define SQR_FFT_MODF_THRESHOLD 500 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 500, 5}, { 29, 6}, { 15, 5}, { 31, 6}, \ + { 16, 5}, { 33, 6}, { 29, 7}, { 15, 6}, \ + { 32, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \ + { 39, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 49, 8}, \ + { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ + { 43, 9}, { 23, 8}, { 55,10}, { 15, 9}, \ + { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ + { 47, 8}, { 95, 9}, { 55,10}, { 31, 9}, \ + { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \ + { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \ + { 95, 9}, { 191,10}, { 111,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 143, 9}, { 287,10}, \ + { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \ + { 383, 8}, { 767, 9}, { 399,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \ + { 671,10}, { 351, 9}, { 703,10}, { 367, 9}, \ + { 735,11}, { 191,10}, { 383, 9}, { 767,10}, \ + { 399, 9}, { 799,10}, { 415, 9}, { 831,10}, \ + { 431, 9}, { 863,10}, { 447,12}, { 127,11}, \ + { 255,10}, { 511, 9}, { 1023,10}, { 543, 9}, \ + { 1087,11}, { 287,10}, { 607, 9}, { 1215,11}, \ + { 319,10}, { 671,11}, { 351,10}, { 735,12}, \ + { 191,11}, { 383,10}, { 799,11}, { 415,10}, \ + { 863,11}, { 447,10}, { 895,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \ + { 1087,11}, { 
607,10}, { 1215,12}, { 319,11}, \ + { 671,10}, { 1343,11}, { 735,10}, { 1471,12}, \ + { 383,11}, { 799,10}, { 1599,11}, { 863,12}, \ + { 447,11}, { 959,10}, { 1919,11}, { 991,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1215,10}, { 2431,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \ + { 1599,12}, { 831,11}, { 1727,12}, { 959,11}, \ + { 1919,14}, { 255,13}, { 511,12}, { 1087,11}, \ + { 2239,12}, { 1215,11}, { 2431,13}, { 639,12}, \ + { 1471,11}, { 2943,13}, { 767,12}, { 1727,13}, \ + { 895,12}, { 1983,14}, { 511,13}, { 1023,12}, \ + { 2239,13}, { 1151,12}, { 2495,13}, { 1279,12}, \ + { 2623,13}, { 1407,12}, { 2943,14}, { 767,13}, \ + { 1535,12}, { 3071,13}, { 1663,12}, { 3455,13}, \ + { 1919,12}, { 3839,15}, { 511,14}, { 1023,13}, \ + { 2175,12}, { 4479,13}, { 2431,14}, { 1279,13}, \ + { 2943,12}, { 5887,14}, { 1535,13}, { 3455,14}, \ + { 1791,13}, { 3967,15}, { 1023,14}, { 2047,13}, \ + { 4479,14}, { 2303,13}, { 4991,12}, { 9983,14}, \ + { 2559,13}, { 5119,14}, { 2815,13}, { 5887,15}, \ + { 1535,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 199 +#define SQR_FFT_THRESHOLD 4736 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 27 +#define MULLO_MUL_N_THRESHOLD 13463 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 26 +#define SQRLO_SQR_THRESHOLD 8907 + +#define DC_DIV_QR_THRESHOLD 38 +#define DC_DIVAPPR_Q_THRESHOLD 103 +#define DC_BDIV_QR_THRESHOLD 44 +#define DC_BDIV_Q_THRESHOLD 98 + +#define INV_MULMOD_BNM1_THRESHOLD 78 +#define INV_NEWTON_THRESHOLD 165 +#define INV_APPR_THRESHOLD 115 + +#define BINV_NEWTON_THRESHOLD 296 +#define REDC_1_TO_REDC_2_THRESHOLD 2 +#define REDC_2_TO_REDC_N_THRESHOLD 147 + +#define MU_DIV_QR_THRESHOLD 2089 +#define MU_DIVAPPR_Q_THRESHOLD 2089 +#define MUPI_DIV_QR_THRESHOLD 70 +#define MU_BDIV_QR_THRESHOLD 1718 +#define MU_BDIV_Q_THRESHOLD 2089 + +#define POWM_SEC_TABLE 7,19,107,480,1486 + +#define GET_STR_DC_THRESHOLD 14 
+#define GET_STR_PRECOMPUTE_THRESHOLD 29 +#define SET_STR_DC_THRESHOLD 126 +#define SET_STR_PRECOMPUTE_THRESHOLD 541 + +#define FAC_DSC_THRESHOLD 132 +#define FAC_ODD_THRESHOLD 29 + +#define MATRIX22_STRASSEN_THRESHOLD 30 +#define HGCD2_DIV1_METHOD 1 /* 6.55% faster than 3 */ +#define HGCD_THRESHOLD 54 +#define HGCD_APPR_THRESHOLD 52 +#define HGCD_REDUCE_THRESHOLD 3524 +#define GCD_DC_THRESHOLD 303 +#define GCDEXT_DC_THRESHOLD 225 +#define JACOBI_BASE_METHOD 4 /* 9.73% faster than 1 */ + +/* Tuneup completed successfully, took 111418 seconds */ diff --git a/gcc/gmp/mpn/arm/v7a/cora17/mod_34lsub1.asm b/gcc/gmp/mpn/arm/v7a/cora17/mod_34lsub1.asm new file mode 100644 index 0000000..39e5a15 100644 --- /dev/null +++ b/gcc/gmp/mpn/arm/v7a/cora17/mod_34lsub1.asm @@ -1,0 +1,121 @@ +dnl ARM mpn_mod_34lsub1 -- remainder modulo 2^24-1. + +dnl Copyright 2012, 2013, 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? 
+C XScale ?
+C Cortex-A5 2.67
+C Cortex-A7 2.37
+C Cortex-A8 2.34
+C Cortex-A9 ?
+C Cortex-A15 1.39
+C Cortex-A17 1.60
+C Cortex-A53 2.51
+
+define(`ap', r0)
+define(`n', r1)
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
+
+C TODO
+C * Write cleverer summation code.
+C * Consider loading 6 64-bit aligned registers at a time, to approach 1 c/l.
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mod_34lsub1)
+	push	{ r4, r5, r6, r7 }	C r4-r7 are scratch below; restored by the pop before each return
+
+	subs	n, n, #3		C n -= 3; negative means fewer than 3 limbs
+	mov	r7, #0			C r7 counts carries out of the 96-bit sum (mov keeps the subs flags)
+	blt	L(le2)			C n <= 2
+
+	ldmia	ap!, { r2, r3, r12 }	C running 96-bit sum r12:r3:r2 = first three limbs
+	subs	n, n, #3		C n -= 3 again
+	blt	L(sum)			C n <= 5
+	mov	r7, #0			C r7 is already 0 here
+	b	L(mid)			C enter the loop at its load
+
+L(top):	adds	r2, r2, r4		C add the triple r6:r5:r4 into r12:r3:r2
+	adcs	r3, r3, r5
+	adcs	r12, r12, r6
+	adc	r7, r7, #0		C count the carry out
+L(mid):	ldmia	ap!, { r4, r5, r6 }	C next three limbs
+	subs	n, n, #3
+	bpl	L(top)			C loop while n is still >= 0 after the decrement
+
+	adds	r2, r2, r4		C add the last full triple
+	adcs	r3, r3, r5
+	adcs	r12, r12, r6
+	adc	r7, r7, #0		C r7 = carries so far (at most one per triple added)
+
+L(sum):	cmn	n, #2			C n is -3, -2 or -1 here: 0, 1 or 2 limbs left
+	movlo	r4, #0			C none left: r4 = 0
+	ldrhs	r4, [ap], #4		C at least one left: r4 = next limb
+	movls	r5, #0			C fewer than two left: r5 = 0
+	ldrhi	r5, [ap], #4		C two left: r5 = next limb
+
+	adds	r2, r2, r4		C add the 0-2 leftover limbs
+	adcs	r3, r3, r5
+	adcs	r12, r12, #0
+	adc	r7, r7, #0		C r7 += final carry out
+
+L(sum2):				C fold r7:r12:r3:r2 using 2^24 == 1 (mod 2^24-1)
+	bic	r0, r2, #0xff000000	C r2 bits 0-23
+	add	r0, r0, r2, lsr #24	C r2 bits 24-31 (weight 2^24 == 1)
+	add	r0, r0, r7		C carries (weight 2^96 == 1)
+
+	mov	r7, r3, lsl #8		C r3 has weight 2^32 == 2^8
+	bic	r2, r7, #0xff000000	C r3 bits 0-15, moved up by 8
+	add	r0, r0, r2
+	add	r0, r0, r3, lsr #16	C r3 bits 16-31 (weight 2^48 == 1)
+
+	mov	r2, r12, lsl #16	C r12 has weight 2^64 == 2^16
+	bic	r1, r2, #0xff000000	C r12 bits 0-7, moved up by 16
+	add	r0, r0, r1
+	add	r0, r0, r12, lsr #8	C r12 bits 8-31 (weight 2^72 == 1)
+
+	pop	{ r4, r5, r6, r7 }
+	return	lr			C r0 is congruent to the input mod 2^24-1 but not fully reduced
+
+L(le2):	cmn	n, #1			C Z set iff n == -1, i.e. exactly two limbs
+	bne	L(1)			C anything else is handled as a single limb
+	ldmia	ap!, { r2, r3 }		C two limbs
+	mov	r12, #0
+	b	L(sum2)			C r7 is still 0
+L(1):	ldr	r2, [ap]		C one limb
+	bic	r0, r2, #0xff000000	C bits 0-23
+	add	r0, r0, r2, lsr #24	C plus bits 24-31
+	pop	{ r4, r5, r6, r7 }
+	return	lr
+EPILOGUE()
diff --git a/gcc/gmp/mpn/arm/v7a/cora17/mul_1.asm b/gcc/gmp/mpn/arm/v7a/cora17/mul_1.asm
new file mode 100644
index 0000000..d9b6042
--- /dev/null
+++ b/gcc/gmp/mpn/arm/v7a/cora17/mul_1.asm
@@ -1,0 +1,34 @@
+dnl ARM mpn_mul_1
+
+dnl Copyright 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_mul_1) +include_mpn(`arm/v6/mul_1.asm') diff --git a/gcc/gmp/mpn/arm/v7a/cora17/submul_1.asm b/gcc/gmp/mpn/arm/v7a/cora17/submul_1.asm new file mode 100644 index 0000000..f3e8139 100644 --- /dev/null +++ b/gcc/gmp/mpn/arm/v7a/cora17/submul_1.asm @@ -1,0 +1,34 @@ +dnl ARM mpn_submul_1 + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_submul_1) +include_mpn(`arm/v6/submul_1.asm') diff --git a/gcc/gmp/mpn/arm/v7a/cora5/gmp-mparam.h b/gcc/gmp/mpn/arm/v7a/cora5/gmp-mparam.h new file mode 100644 index 0000000..e3564e0 100644 --- /dev/null +++ b/gcc/gmp/mpn/arm/v7a/cora5/gmp-mparam.h @@ -1,0 +1,205 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 1500 MHz Cortex-A5 (odroid c1) */ +/* FFT tuning limit = 18,235,562 */ +/* Generated by tuneup.c, 2019-10-22, gcc 4.9 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 23 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 132.79% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 52 + +#define DIV_1_VS_MUL_1_PERCENT 213 + +#define MUL_TOOM22_THRESHOLD 48 +#define MUL_TOOM33_THRESHOLD 143 +#define MUL_TOOM44_THRESHOLD 262 +#define MUL_TOOM6H_THRESHOLD 414 +#define MUL_TOOM8H_THRESHOLD 527 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 153 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 168 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 152 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 180 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 226 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 66 +#define SQR_TOOM3_THRESHOLD 149 +#define SQR_TOOM4_THRESHOLD 348 +#define SQR_TOOM6_THRESHOLD 517 +#define SQR_TOOM8_THRESHOLD 608 + +#define MULMID_TOOM42_THRESHOLD 70 + +#define MULMOD_BNM1_THRESHOLD 26 +#define SQRMOD_BNM1_THRESHOLD 28 + +#define MUL_FFT_MODF_THRESHOLD 660 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 660, 5}, { 29, 6}, { 15, 5}, { 33, 6}, \ + { 17, 5}, { 35, 6}, { 29, 7}, { 15, 6}, \ + { 37, 7}, { 19, 6}, { 40, 7}, { 21, 6}, \ + { 43, 7}, { 37, 8}, { 19, 7}, { 43, 8}, \ + { 23, 7}, { 51, 8}, { 27, 7}, { 55, 8}, \ + { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \ + { 55, 9}, { 31, 8}, { 71, 
9}, { 39, 8}, \ + { 83, 9}, { 47, 8}, { 99, 9}, { 55,10}, \ + { 31, 9}, { 63, 8}, { 127, 9}, { 79,10}, \ + { 47, 9}, { 103,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 167,10}, { 95, 9}, \ + { 191,10}, { 111,11}, { 63,10}, { 159,11}, \ + { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \ + { 671,11}, { 191,10}, { 383, 9}, { 767,10}, \ + { 399, 9}, { 799,10}, { 415,11}, { 223,12}, \ + { 127,11}, { 255,10}, { 511, 9}, { 1023,10}, \ + { 543,11}, { 287,10}, { 607,11}, { 319,10}, \ + { 671,11}, { 351,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,10}, { 831,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \ + { 1087,11}, { 575,10}, { 1151,11}, { 607,12}, \ + { 319,11}, { 703,12}, { 383,11}, { 831,12}, \ + { 447,11}, { 895,13}, { 255,12}, { 511,11}, \ + { 1087,12}, { 575,11}, { 1183,12}, { 639,11}, \ + { 1279,12}, { 703,13}, { 383,12}, { 767,11}, \ + { 1535,12}, { 895,14}, { 255,13}, { 511,12}, \ + { 1151,13}, { 639,12}, { 1407,13}, { 767,12}, \ + { 1599,13}, { 895,12}, { 1791,14}, { 511,13}, \ + { 1023,12}, { 2111,13}, { 1151,12}, { 2367,13}, \ + { 1279,12}, { 2559,13}, { 1407,14}, { 767,13}, \ + { 1535,12}, { 3071,13}, { 1663,12}, { 3327,13}, \ + { 1791,15}, { 511,14}, { 1023,13}, { 2175,12}, \ + { 4351,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 140 +#define MUL_FFT_THRESHOLD 7552 + +#define SQR_FFT_MODF_THRESHOLD 590 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 590, 5}, { 33, 6}, { 17, 5}, { 35, 6}, \ + { 36, 7}, { 19, 6}, { 40, 7}, { 21, 6}, \ + { 43, 7}, { 23, 6}, { 47, 7}, { 37, 8}, \ + { 19, 7}, { 43, 8}, { 23, 7}, { 49, 8}, \ + { 27, 7}, { 55, 8}, { 31, 7}, { 63, 8}, \ + { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 83, 9}, { 47, 8}, \ + { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 103,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 167,10}, { 95, 9}, \ + { 191,10}, { 111,11}, { 63,10}, { 
159,11}, \ + { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 335, 9}, { 671,10}, { 351,11}, { 191,10}, \ + { 383, 9}, { 767,10}, { 415,12}, { 127,11}, \ + { 255,10}, { 511, 9}, { 1023,10}, { 543, 9}, \ + { 1087,11}, { 287,10}, { 575, 9}, { 1151,10}, \ + { 607,11}, { 319,10}, { 671,11}, { 351,12}, \ + { 191,11}, { 383,10}, { 799,11}, { 415,10}, \ + { 831,13}, { 127,12}, { 255,11}, { 511,10}, \ + { 1023,11}, { 543,10}, { 1087,11}, { 575,10}, \ + { 1151,11}, { 607,12}, { 319,11}, { 735,12}, \ + { 383,11}, { 831,12}, { 447,11}, { 927,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1151,12}, { 639,11}, { 1279,12}, { 703,13}, \ + { 383,12}, { 767,11}, { 1535,12}, { 831,11}, \ + { 1663,12}, { 895,11}, { 1791,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1023,11}, { 2047,12}, \ + { 1151,13}, { 639,12}, { 1407,13}, { 767,12}, \ + { 1599,13}, { 895,12}, { 1791,14}, { 511,13}, \ + { 1023,12}, { 2111,13}, { 1151,12}, { 2367,13}, \ + { 1279,12}, { 2559,13}, { 1407,14}, { 767,13}, \ + { 1535,12}, { 3071,13}, { 1663,12}, { 3327,13}, \ + { 1791,15}, { 511,14}, { 1023,13}, { 2175,12}, \ + { 4351,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 144 +#define SQR_FFT_THRESHOLD 5760 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 39 +#define MULLO_MUL_N_THRESHOLD 14709 +#define SQRLO_BASECASE_THRESHOLD 8 +#define SQRLO_DC_THRESHOLD 33 +#define SQRLO_SQR_THRESHOLD 11278 + +#define DC_DIV_QR_THRESHOLD 36 +#define DC_DIVAPPR_Q_THRESHOLD 116 +#define DC_BDIV_QR_THRESHOLD 48 +#define DC_BDIV_Q_THRESHOLD 140 + +#define INV_MULMOD_BNM1_THRESHOLD 95 +#define INV_NEWTON_THRESHOLD 181 +#define INV_APPR_THRESHOLD 125 + +#define BINV_NEWTON_THRESHOLD 327 +#define REDC_1_TO_REDC_2_THRESHOLD 0 /* always */ +#define REDC_2_TO_REDC_N_THRESHOLD 152 + +#define MU_DIV_QR_THRESHOLD 2350 +#define MU_DIVAPPR_Q_THRESHOLD 2130 +#define 
MUPI_DIV_QR_THRESHOLD 98 +#define MU_BDIV_QR_THRESHOLD 1970 +#define MU_BDIV_Q_THRESHOLD 2172 + +#define POWM_SEC_TABLE 6,37,108,624,2351 + +#define GET_STR_DC_THRESHOLD 28 +#define GET_STR_PRECOMPUTE_THRESHOLD 44 +#define SET_STR_DC_THRESHOLD 309 +#define SET_STR_PRECOMPUTE_THRESHOLD 762 + +#define FAC_DSC_THRESHOLD 236 +#define FAC_ODD_THRESHOLD 29 + +#define MATRIX22_STRASSEN_THRESHOLD 25 +#define HGCD2_DIV1_METHOD 5 /* 2.92% faster than 3 */ +#define HGCD_THRESHOLD 70 +#define HGCD_APPR_THRESHOLD 59 +#define HGCD_REDUCE_THRESHOLD 4120 +#define GCD_DC_THRESHOLD 229 +#define GCDEXT_DC_THRESHOLD 233 +#define JACOBI_BASE_METHOD 1 /* 17.07% faster than 4 */ + +/* Tuneup completed successfully, took 47845 seconds */ diff --git a/gcc/gmp/mpn/arm/v7a/cora8/bdiv_q_1.asm b/gcc/gmp/mpn/arm/v7a/cora8/bdiv_q_1.asm new file mode 100644 index 0000000..e74b260 100644 --- /dev/null +++ b/gcc/gmp/mpn/arm/v7a/cora8/bdiv_q_1.asm @@ -1,0 +1,158 @@ +dnl ARM v6 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor. +dnl This is v6 code but it runs well on just the v7a Cortex-A8, A9, and A15. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C norm unorm +C 1176 - - +C Cortex-A5 9 13 +C Cortex-A7 12 18 +C Cortex-A8 13 14 +C Cortex-A9 9 10 not measured since latest edits +C Cortex-A15 7 7 +C Cortex-A53 16 24 + +C Architecture requirements: +C v5 - +C v5t clz +C v5te - +C v6 umaal +C v6t2 - +C v7a - + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') +define(`d', `r3') +define(`di_arg', `sp[0]') C just mpn_pi1_bdiv_q_1 +define(`cnt_arg', `sp[4]') C just mpn_pi1_bdiv_q_1 + +define(`cy', `r7') +define(`cnt', `r6') +define(`tnc', `r4') + +ASM_START() +PROLOGUE(mpn_bdiv_q_1) + push {r6-r11} + + rsb r10, d, #0 + and r10, r10, d + clz r10, r10 + rsbs cnt, r10, #31 C count_trailing_zeros + mov d, d, lsr cnt + +C binvert limb + LEA( r10, binvert_limb_table) + and r12, d, #254 + ldrb r10, [r10, r12, lsr #1] + mul r12, r10, r10 + mul r12, d, r12 + rsb r12, r12, r10, lsl #1 + mul r10, r12, r12 + mul r10, d, r10 + rsb r10, r10, r12, lsl #1 C r10 = inverse + b L(pi1) +EPILOGUE() + +PROLOGUE(mpn_pi1_bdiv_q_1) + push {r6-r11} + + ldr cnt, [sp, #28] + ldr r10, [sp, #24] + cmp cnt, #0 + +L(pi1): ldr r11, [up], #4 C up[0] + mov cy, #0 + rsb r8, r10, #0 C r8 = -inverse + bne L(unorm) + +L(norm): + subs n, n, #1 + mul r11, r11, r10 + beq L(edn) + + ALIGN(16) +L(tpn): ldr r9, [up], #4 + mov r12, #0 + str r11, [rp], #4 + umaal r12, cy, r11, d + mul r11, r9, r10 + mla r11, cy, r8, r11 + subs n, n, #1 + bne L(tpn) + +L(edn): str r11, [rp] + pop {r6-r11} + bx r14 + +L(unorm): + push {r4-r5} + rsb tnc, cnt, #32 + mov 
r5, r11, lsr cnt + subs n, n, #1 + beq L(ed1) + + ldr r12, [up], #4 + orr r9, r5, r12, lsl tnc + mov r5, r12, lsr cnt + mul r11, r9, r10 + subs n, n, #1 + beq L(edu) + + ALIGN(16) +L(tpu): ldr r12, [up], #4 + orr r9, r5, r12, lsl tnc + mov r5, r12, lsr cnt + mov r12, #0 + str r11, [rp], #4 + umaal r12, cy, r11, d + mul r11, r9, r10 + mla r11, cy, r8, r11 + subs n, n, #1 + bne L(tpu) + +L(edu): str r11, [rp], #4 + mov r12, #0 + umaal r12, cy, r11, d + mul r11, r5, r10 + mla r11, cy, r8, r11 + str r11, [rp] + pop {r4-r11} + bx r14 + +L(ed1): mul r11, r5, r10 + str r11, [rp] + pop {r4-r11} + bx r14 +EPILOGUE() diff --git a/gcc/gmp/mpn/arm/v7a/cora9/bdiv_q_1.asm b/gcc/gmp/mpn/arm/v7a/cora9/bdiv_q_1.asm new file mode 100644 index 0000000..245b371 100644 --- /dev/null +++ b/gcc/gmp/mpn/arm/v7a/cora9/bdiv_q_1.asm @@ -1,0 +1,36 @@ +dnl ARM mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1) +include_mpn(`arm/v7a/cora8/bdiv_q_1.asm') diff --git a/gcc/gmp/mpn/powerpc64/mode64/p7/gcd_11.asm b/gcc/gmp/mpn/powerpc64/mode64/p7/gcd_11.asm new file mode 100644 index 0000000..f04e896 100644 --- /dev/null +++ b/gcc/gmp/mpn/powerpc64/mode64/p7/gcd_11.asm @@ -1,0 +1,67 @@ +dnl PowerPC-64 mpn_gcd_11. + +dnl Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/bit (approx) +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 7.6 obsolete +C POWER8 ? +C POWER9 ? 
+C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1 + +C INPUT PARAMETERS +define(`u0', `r3') +define(`v0', `r4') + +define(`cnt', `r9')dnl + +ASM_START() +PROLOGUE(mpn_gcd_11) + li r12, 63 + b L(odd) + + ALIGN(16) +L(top): and r8, r11, r10 C isolate lsb + cntlzd cnt, r8 + isel v0, u0, v0, 29 C v = min(u,v) + isel u0, r10, r11, 29 C u = |u - v| + subf cnt, cnt, r12 C cnt = 63-cnt + srd u0, u0, cnt +L(odd): cmpld cr7, v0, u0 + subf r10, u0, v0 C r10 = v - u + subf r11, v0, u0 C r11 = u - v + bne cr7, L(top) + +L(end): blr +EPILOGUE() diff --git a/gcc/gmp/mpn/powerpc64/mode64/p7/gcd_22.asm b/gcc/gmp/mpn/powerpc64/mode64/p7/gcd_22.asm new file mode 100644 index 0000000..ade30e4 100644 --- /dev/null +++ b/gcc/gmp/mpn/powerpc64/mode64/p7/gcd_22.asm @@ -1,0 +1,146 @@ +dnl PowerPC-64 mpn_gcd_22 optimised for POWER7 and POWER8. + +dnl Copyright 2000-2002, 2005, 2009, 2011-2013, 2019 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/bit (approx) +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 12.3 +C POWER8 13.4 +C POWER9 10.6 + +C We define SLOW if this target uses a slow struct return mechanism, with +C r3 as an implicit parameter for the struct pointer. +undefine(`SLOW')dnl +ifdef(`AIX',`define(`SLOW',`due to AIX')',` + ifdef(`DARWIN',,` + ifdef(`ELFv2_ABI',,`define(`SLOW',`due to ELFv1')')dnl + ') +') + +ifdef(`SLOW',` +define(`IFSLOW', `$1') +define(`u1', `r4') +define(`u0', `r5') +define(`v1', `r6') +define(`v0', `r7') +',` +define(`IFSLOW', `') +define(`u1', `r3') +define(`u0', `r4') +define(`v1', `r5') +define(`v0', `r6') +') + +define(`tmp', `r0') +define(`t0', `r8') +define(`t1', `r9') +define(`s0', `r10') +define(`s1', `r11') +define(`cnt', `r12') + +ASM_START() +PROLOGUE(mpn_gcd_22) +L(top): subfc. t0, v0, u0 C 0 12 + beq cr0, L(lowz) + subfe t1, v1, u1 C 2 14 + subfe. tmp, tmp, tmp C 4 set cr0 from the carry bit + subfc s0, u0, v0 C 0 + subfe s1, u1, v1 C 2 + +L(bck): and tmp, s0, t0 C 2 + cntlzd cnt, tmp C 4 + addi tmp, cnt, 1 C 6 + subfic cnt, cnt, 63 C 6 + + isel v0, v0, u0, 2 C 6 use condition set by subfe + isel v1, v1, u1, 2 C 6 + isel u0, t0, s0, 2 C 6 + isel u1, t1, s1, 2 C 6 + + srd u0, u0, cnt C 8 + sld tmp, u1, tmp C 8 + srd u1, u1, cnt C 8 + or u0, u0, tmp C 10 + + or. r0, u1, v1 C 10 + bne L(top) + + + li r0, 63 + b L(odd) + ALIGN(16) +L(top1):isel v0, u0, v0, 29 C v = min(u,v) + isel u0, r10, r11, 29 C u = |u - v| + subf cnt, cnt, r0 C cnt = 63-cnt + srd u0, u0, cnt +L(odd): subf r10, u0, v0 C r10 = v - u + subf r11, v0, u0 C r11 = u - v + cmpld cr7, v0, u0 + and r8, r11, r10 C isolate lsb + cntlzd cnt, r8 + bne cr7, L(top1) + +ifdef(`SLOW',` + std v0, 0(r3) + std r10, 8(r3) C zero +',` + mr r3, v0 + li r4, 0 +') + blr + + +L(lowz):C We come here when v0 - u0 = 0 + C 1. If v1 - u1 = 0, then gcd is u = v. + C 2. Else compute gcd_21({v1,v0}, |u1-v1|) + subfc. 
t0, v1, u1 C 2 8 + beq L(end) + li t1, 0 + subfe. tmp, tmp, tmp C 4 set cr0 from the carry bit + subf s0, u1, v1 C 2 + li s1, 0 + b L(bck) + +L(end): +ifdef(`SLOW',` + std v0, 0(r3) + std v1, 8(r3) + blr +',` + mr r3, v0 + mr r4, v1 + blr +') +EPILOGUE() diff --git a/gcc/gmp/mpn/powerpc64/mode64/p8/gmp-mparam.h b/gcc/gmp/mpn/powerpc64/mode64/p8/gmp-mparam.h new file mode 100644 index 0000000..ed4db28 100644 --- /dev/null +++ b/gcc/gmp/mpn/powerpc64/mode64/p8/gmp-mparam.h @@ -1,0 +1,170 @@ +/* POWER8 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 4150 MHz POWER8/SMT4 */ +/* FFT tuning limit = 0.5 M */ +/* Generated by tuneup.c, 2019-09-24, gcc 7.2 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 22 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 10 +#define USE_PREINV_DIVREM_1 0 +#define DIV_QR_1N_PI1_METHOD 2 /* 16.97% faster than 1 */ +#define DIV_QR_1_NORM_THRESHOLD 2 +#define DIV_QR_1_UNNORM_THRESHOLD 1 +#define DIV_QR_2_PI2_THRESHOLD 9 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 34 + +#define DIV_1_VS_MUL_1_PERCENT 276 + +#define MUL_TOOM22_THRESHOLD 18 +#define MUL_TOOM33_THRESHOLD 73 +#define MUL_TOOM44_THRESHOLD 195 +#define MUL_TOOM6H_THRESHOLD 278 +#define MUL_TOOM8H_THRESHOLD 406 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 131 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 121 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 138 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 106 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 32 +#define SQR_TOOM3_THRESHOLD 97 +#define SQR_TOOM4_THRESHOLD 178 +#define SQR_TOOM6_THRESHOLD 303 +#define SQR_TOOM8_THRESHOLD 454 + +#define MULMID_TOOM42_THRESHOLD 42 + +#define MULMOD_BNM1_THRESHOLD 15 +#define SQRMOD_BNM1_THRESHOLD 19 + +#define MUL_FFT_MODF_THRESHOLD 404 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 404, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 12, 6}, \ + { 25, 7}, { 21, 8}, { 11, 7}, { 25, 8}, \ + { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 63,10}, 
{ 39, 9}, { 79,10}, { 47, 9}, \ + { 95,11}, { 31,10}, { 63, 9}, { 131,10}, \ + { 79,11}, { 47,10}, { 95,12}, { 31,11}, \ + { 63,10}, { 127, 9}, { 255,10}, { 135,11}, \ + { 79,10}, { 159,11}, { 95, 8}, { 767, 7}, \ + { 1599,11}, { 111,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,11}, { 143,10}, { 287, 9}, \ + { 575,11}, { 159,12}, { 95,11}, { 191,10}, \ + { 383,13}, { 8192,14}, { 16384,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 80 +#define MUL_FFT_THRESHOLD 4736 + +#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 340, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 21, 7}, { 11, 6}, { 23, 7}, { 21, 8}, \ + { 11, 7}, { 25, 8}, { 13, 7}, { 27, 8}, \ + { 15, 7}, { 31, 8}, { 21, 9}, { 11, 8}, \ + { 27, 9}, { 15, 8}, { 33, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 47,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,11}, { 79, 9}, { 319,11}, \ + { 95,10}, { 191,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271,11}, { 143,10}, \ + { 287, 9}, { 575,10}, { 303, 9}, { 607,10}, \ + { 319,12}, { 95,11}, { 191,10}, { 383,13}, \ + { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 71 +#define SQR_FFT_THRESHOLD 3264 + +#define MULLO_BASECASE_THRESHOLD 3 +#define MULLO_DC_THRESHOLD 33 +#define MULLO_MUL_N_THRESHOLD 9174 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 114 +#define SQRLO_SQR_THRESHOLD 6461 + +#define DC_DIV_QR_THRESHOLD 38 +#define DC_DIVAPPR_Q_THRESHOLD 158 +#define DC_BDIV_QR_THRESHOLD 48 +#define DC_BDIV_Q_THRESHOLD 112 + +#define INV_MULMOD_BNM1_THRESHOLD 74 +#define INV_NEWTON_THRESHOLD 
132 +#define INV_APPR_THRESHOLD 131 + +#define BINV_NEWTON_THRESHOLD 278 +#define REDC_1_TO_REDC_2_THRESHOLD 56 +#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */ + +#define MU_DIV_QR_THRESHOLD 1142 +#define MU_DIVAPPR_Q_THRESHOLD 1142 +#define MUPI_DIV_QR_THRESHOLD 46 +#define MU_BDIV_QR_THRESHOLD 1142 +#define MU_BDIV_Q_THRESHOLD 1470 + +#define POWM_SEC_TABLE 3,19,117,672,1867 + +#define GET_STR_DC_THRESHOLD 11 +#define GET_STR_PRECOMPUTE_THRESHOLD 18 +#define SET_STR_DC_THRESHOLD 608 +#define SET_STR_PRECOMPUTE_THRESHOLD 2405 + +#define FAC_DSC_THRESHOLD 164 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 14 +#define HGCD2_DIV1_METHOD 1 /* 6.88% faster than 3 */ +#define HGCD_THRESHOLD 114 +#define HGCD_APPR_THRESHOLD 118 +#define HGCD_REDUCE_THRESHOLD 2205 +#define GCD_DC_THRESHOLD 440 +#define GCDEXT_DC_THRESHOLD 345 +#define JACOBI_BASE_METHOD 1 /* 0.74% faster than 4 */ diff --git a/gcc/gmp/mpn/powerpc64/mode64/p9/add_n_sub_n.asm b/gcc/gmp/mpn/powerpc64/mode64/p9/add_n_sub_n.asm new file mode 100644 index 0000000..2426a00 100644 --- /dev/null +++ b/gcc/gmp/mpn/powerpc64/mode64/p9/add_n_sub_n.asm @@ -1,0 +1,112 @@ +dnl PowerPC-64 mpn_add_n_sub_n optimised for POWER9. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 - +C POWER8 - +C POWER9 2.25 + + +C INPUT PARAMETERS +define(`arp', `r3') +define(`srp', `r4') +define(`up', `r5') +define(`vp', `r6') +define(`n', `r7') + +ASM_START() +PROLOGUE(mpn_add_n_sub_n) + cmpdi cr7, n, 2 + subfo r0, r0, r0 C clear OV + rldicl. r9, n, 0, 63 C n & 1 + beq cr0, L(bx0) + +L(bx1): ld r10, 0(up) + ld r11, 0(vp) + ble cr7, L(1) + srdi r7, r7, 1 + mtctr r7 + ld r8, 8(up) + ld r9, 8(vp) + addex( r0, r10, r11, 0) + subfc r12, r11, r10 + addi up, up, -8 + addi vp, vp, -8 + b L(lo1) + +L(bx0): ld r8, 0(up) + ld r9, 0(vp) + ld r10, 8(up) + ld r11, 8(vp) + addex( r0, r8, r9, 0) + subfc r12, r9, r8 + addi arp, arp, 8 + addi srp, srp, 8 + ble cr7, L(end) + addi r7, r7, -1 + srdi r7, r7, 1 + mtctr r7 + +L(top): ld r8, 16(up) + ld r9, 16(vp) + std r0, -8(arp) + std r12, -8(srp) + addex( r0, r10, r11, 0) + subfe r12, r11, r10 +L(lo1): ld r10, 24(up) + ld r11, 24(vp) + std r0, 0(arp) + std r12, 0(srp) + addex( r0, r8, r9, 0) + subfe r12, r9, r8 + addi up, up, 16 + addi vp, vp, 16 + addi arp, arp, 16 + addi srp, srp, 16 + bdnz L(top) + +L(end): std r0, -8(arp) + std r12, -8(srp) +L(1): addex( r0, r10, r11, 0) + subfe r12, r11, r10 + std r0, 0(arp) + std r12, 0(srp) + subfe r3, r3, r3 + addex( r3, r3, r3, 0) + rldicl r3, r3, 1, 62 + blr +EPILOGUE() +ASM_END() diff --git a/gcc/gmp/mpn/powerpc64/mode64/p9/addmul_1.asm b/gcc/gmp/mpn/powerpc64/mode64/p9/addmul_1.asm new file mode 100644 index 
0000000..8f49606 100644 --- /dev/null +++ b/gcc/gmp/mpn/powerpc64/mode64/p9/addmul_1.asm @@ -1,0 +1,130 @@ +dnl Power9 mpn_addmul_1. + +dnl Copyright 2017, 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 - +C POWER8 - +C POWER9 2.5 + +C TODO +C * Schedule for Power9 pipeline. +C * Unroll 4x if that proves beneficial. +C * This is marginally faster (but much smaller) than ../aorsmul_1.asm. + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`v0', `r6') + +ASM_START() +PROLOGUE(mpn_addmul_1) + cmpdi cr6, n, 2 + addi r0, n, -1 C FIXME: postpone + srdi r0, r0, 1 C FIXME: postpone + mtctr r0 C FIXME: postpone + rldicl.
r0, n, 0,63 C r0 = n & 1, set cr0 + bne cr0, L(b1) + +L(b0): ld r10, 0(rp) + ld r12, 0(up) + ld r11, 8(rp) + ld r0, 8(up) + maddld( r9, r12, v0, r10) + maddhdu(r7, r12, v0, r10) + ble cr6, L(2) + ld r10, 16(rp) + ld r12, 16(up) + maddld( r8, r0, v0, r11) + maddhdu(r5, r0, v0, r11) + addic up, up, 16 + addi rp, rp, -8 + b L(mid) + +L(b1): ld r11, 0(rp) + ld r0, 0(up) + ble cr6, L(1) + ld r10, 8(rp) + ld r12, 8(up) + maddld( r8, r0, v0, r11) + maddhdu(r5, r0, v0, r11) + ld r11, 16(rp) + ld r0, 16(up) + maddld( r9, r12, v0, r10) + maddhdu(r7, r12, v0, r10) + addic up, up, 24 + bdz L(end) + + ALIGN(16) +L(top): ld r10, 24(rp) + ld r12, 0(up) + std r8, 0(rp) + adde r9, r5, r9 + maddld( r8, r0, v0, r11) C W:0,2,4 + maddhdu(r5, r0, v0, r11) C W:1,3,5 +L(mid): ld r11, 32(rp) + ld r0, 8(up) + std r9, 8(rp) + adde r8, r7, r8 + maddld( r9, r12, v0, r10) C W:1,3,5 + maddhdu(r7, r12, v0, r10) C W:2,4,6 + addi rp, rp, 16 + addi up, up, 16 + bdnz L(top) + +L(end): std r8, 0(rp) + maddld( r8, r0, v0, r11) + adde r9, r5, r9 + maddhdu(r5, r0, v0, r11) + std r9, 8(rp) + adde r8, r7, r8 + std r8, 16(rp) + addze r3, r5 + blr + +L(2): maddld( r8, r0, v0, r11) + maddhdu(r5, r0, v0, r11) + std r9, 0(rp) + addc r8, r7, r8 + std r8, 8(rp) + addze r3, r5 + blr + +L(1): maddld( r8, r0, v0, r11) + std r8, 0(rp) + maddhdu(r3, r0, v0, r11) + blr +EPILOGUE() diff --git a/gcc/gmp/mpn/powerpc64/mode64/p9/addmul_2.asm b/gcc/gmp/mpn/powerpc64/mode64/p9/addmul_2.asm new file mode 100644 index 0000000..1dd59ea 100644 --- /dev/null +++ b/gcc/gmp/mpn/powerpc64/mode64/p9/addmul_2.asm @@ -1,0 +1,182 @@ +dnl Power9 mpn_addmul_2. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library.
+ +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C power9: 1.62 + +C STATUS +C * Not written with any power9 pipeline understanding. +C * The 4x unrolling was not motivated by any timing tests. +C * No local scheduling for performance tweaking has been done. +C * Decrease load scheduling! + +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') C Note: Reused as scratch +define(`vp', `r6') C Note: Reused for v1 + +define(`v0', `r7') +define(`v1', `r6') + + +ASM_START() +PROLOGUE(mpn_addmul_2) + std r26, -48(r1) + std r27, -40(r1) + std r28, -32(r1) + std r29, -24(r1) + std r30, -16(r1) + std r31, -8(r1) + + subfic r0, r1, 0 C clear CA + subfo r0, r0, r0 C clear OV and r0 + + cmpdi cr7, n, 4 + + ld v0, 0(vp) + ld v1, 8(vp) + + srdi r10, n, 2 + mtctr r10 + + rldicl. r9, n, 0, 63 + bne cr0, L(bx1) + +L(bx0): rldicl. r9, n, 63, 63 + + ld r28, 0(rp) + ld r8, 0(up) + ld r11, 8(rp) + ld r9, 8(up) + maddld( r26, r8, v0, r28) + maddhdu(r31, r8, v0, r28) + blt cr7, L(2) + ld r28, 16(rp) + mulld r5, r8, v1 + mulhdu r10, r8, v1 + bne cr0, L(b10) + +L(b00): addi up, up, -8 + addi rp, rp, -24 + b L(lo0) + +L(b10): addi up, up, 8 + addi rp, rp, -8 + b L(lo2) + +L(2): addi rp, rp, -8 + mulld r5, r8, v1 + mulhdu r10, r8, v1 + b L(cj2) + +L(bx1): rldicl. 
r9, n, 63, 63 + + ld r29, 0(rp) + ld r9, 0(up) + ld r10, 8(rp) + ld r8, 8(up) + maddld( r27, r9, v0, r29) + maddhdu(r30, r9, v0, r29) + ld r29, 16(rp) + mulld r12, r9, v1 + mulhdu r11, r9, v1 + bne cr0, L(b11) + +L(b01): addi rp, rp, -16 + b L(lo1) +L(b11): addi up, up, 16 + blt cr7, L(end) + +L(top): ld r9, 0(up) + maddld( r26, r8, v0, r10) C 0 4 -> adde + maddhdu(r31, r8, v0, r10) C 1 5 + adde r0, r27, r0 C 7 11 + ld r28, 24(rp) + std r0, 0(rp) + maddld( r5, r8, v1, r29) C 1 5 -> addex + maddhdu(r10, r8, v1, r29) C 2 6 + addex( r0, r12, r30, 0) C 8 12 +L(lo2): ld r8, 8(up) + maddld( r27, r9, v0, r11) C 1 5 -> adde + maddhdu(r30, r9, v0, r11) C 2 6 + adde r0, r26, r0 C 8 12 + ld r29, 32(rp) + std r0, 8(rp) + maddld( r12, r9, v1, r28) C 2 6 -> addex + maddhdu(r11, r9, v1, r28) C 3 7 + addex( r0, r5, r31, 0) C 5 9 13 +L(lo1): ld r9, 16(up) + maddld( r26, r8, v0, r10) C 2 6 -> adde + maddhdu(r31, r8, v0, r10) C 3 7 + adde r0, r27, r0 C 5 9 13 + ld r28, 40(rp) + std r0, 16(rp) + maddld( r5, r8, v1, r29) C 3 7 -> addex + maddhdu(r10, r8, v1, r29) C 4 8 + addex( r0, r12, r30, 0) C 6 10 +L(lo0): ld r8, 24(up) + maddld( r27, r9, v0, r11) C 3 7 -> adde + maddhdu(r30, r9, v0, r11) C 4 8 + adde r0, r26, r0 C 6 10 + ld r29, 48(rp) + std r0, 24(rp) + maddld( r12, r9, v1, r28) C 4 8 -> addex + maddhdu(r11, r9, v1, r28) C 5 9 + addex( r0, r5, r31, 0) C 7 11 + addi up, up, 32 + addi rp, rp, 32 + bdnz L(top) + +L(end): ld r9, 0(up) + maddld( r26, r8, v0, r10) C 0 4 + maddhdu(r31, r8, v0, r10) C 1 5 + adde r0, r27, r0 C 7 11 + std r0, 0(rp) C -4 + maddld( r5, r8, v1, r29) C 1 5 + maddhdu(r10, r8, v1, r29) C 2 6 + addex( r0, r12, r30, 0) C 8 12 +L(cj2): maddld( r27, r9, v0, r11) C 1 5 -2 + maddhdu(r30, r9, v0, r11) C 2 6 -1 + adde r0, r26, r0 C 8 12 -3 + std r0, 8(rp) C -3 + mulld r12, r9, v1 C 2 6 -1 + mulhdu r11, r9, v1 C 3 7 0 = return limb + addex( r0, r5, r31, 0) C 5 9 13 + adde r0, r27, r0 C 5 9 13 -2 + std r0, 16(rp) C -2 + addex( r0, r12, r30, 0) C 6 10 -1 + adde r0, r0, r10 
C -1 + std r0, 24(rp) C -1 + li r4, 0 + addze r3, r11 + addex( r3, r3, r4, 0) + +L(ret): ld r26, -48(r1) + ld r27, -40(r1) + ld r28, -32(r1) + ld r29, -24(r1) + ld r30, -16(r1) + ld r31, -8(r1) + blr +EPILOGUE() +ASM_END() diff --git a/gcc/gmp/mpn/powerpc64/mode64/p9/aorsmul_1.asm b/gcc/gmp/mpn/powerpc64/mode64/p9/aorsmul_1.asm new file mode 100644 index 0000000..e4ca3a8 100644 --- /dev/null +++ b/gcc/gmp/mpn/powerpc64/mode64/p9/aorsmul_1.asm @@ -1,0 +1,179 @@ +dnl POWER9 mpn_addmul_1 and mpn_submul_1. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C mpn_addmul_1 mpn_submul_1 +C cycles/limb cycles/limb +C POWER3/PPC630 - - +C POWER4/PPC970 - - +C POWER5 - - +C POWER6 - - +C POWER7 - - +C POWER8 - - +C POWER9 2.63 2.63 + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`v0', `r6') + + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUBC', adde) + define(`ADDSUB', addc) + define(`func', mpn_addmul_1) + define(`AM', `$1') + define(`SM', `') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUBC', subfe) + define(`ADDSUB', subfc) + define(`func', mpn_submul_1) + define(`AM', `') + define(`SM', `$1') +') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +ASM_START() +PROLOGUE(func) + cmpdi cr7, n, 3 + srdi r10, n, 2 + mtctr r10 + rldicl. r9, n, 0, 63 + ld r11, 0(up) + bne cr0, L(bx1) + +L(bx0): rldicl. r9, n, 63, 63 +AM(` subfzeo r12, n ') C ov = 0, ca = 0 +AM(` li r12, 0 ') +SM(` subfco r12, r12, r12 ') C r12 = 0, ov = 0, ca = 1 + ld r9, 8(up) + mulld r0, r11, v0 + mulhdu r5, r11, v0 + blt cr7, L(2) + ld r8, 16(up) + bne cr0, L(b10) + +L(b00): addi rp, rp, -24 + b L(lo0) +L(b10): addi rp, rp, -8 + addi up, up, 16 + b L(lo2) + +L(2): addi rp, rp, -8 + b L(cj2) + +L(bx1): rldicl. 
r9, n, 63, 63 +AM(` subfzeo r5, n ') C ov = 0, ca = 0 +AM(` li r5, 0 ') +SM(` subfco r5, r5, r5 ') C r5 = 0, ov = 0, ca = 1 + blt cr7, L(1) + ld r8, 8(up) + mulld r7, r11, v0 + mulhdu r12, r11, v0 + ld r9, 16(up) + bne cr0, L(b11) + +L(b01): addi rp, rp, -16 + addi up, up, 8 + b L(lo1) + +L(1): mulld r7, r11, v0 + mulhdu r12, r11, v0 + ld r11, 0(rp) + ADDSUB r10, r7, r11 + std r10, 0(rp) +AM(` addze r3, r12 ') +SM(` subfe r0, r0, r0 ') +SM(` sub r3, r12, r0 ') + blr + +L(b11): addi up, up, 24 + ble cr7, L(end) + + ALIGN(16) +L(top): ld r11, 0(rp) + mulld r0, r8, v0 + addex( r7, r7, r5, 0) + mulhdu r5, r8, v0 + ld r8, 0(up) + ADDSUBC r10, r7, r11 + std r10, 0(rp) +L(lo2): ld r11, 8(rp) + mulld r7, r9, v0 + addex( r0, r0, r12, 0) + mulhdu r12, r9, v0 + ld r9, 8(up) + ADDSUBC r10, r0, r11 + std r10, 8(rp) +L(lo1): ld r11, 16(rp) + mulld r0, r8, v0 + addex( r7, r7, r5, 0) + mulhdu r5, r8, v0 + ld r8, 16(up) + ADDSUBC r10, r7, r11 + std r10, 16(rp) +L(lo0): ld r11, 24(rp) + mulld r7, r9, v0 + addex( r0, r0, r12, 0) + mulhdu r12, r9, v0 + ld r9, 24(up) + ADDSUBC r10, r0, r11 + std r10, 24(rp) + addi up, up, 32 + addi rp, rp, 32 + bdnz L(top) + +L(end): ld r11, 0(rp) + mulld r0, r8, v0 + addex( r7, r7, r5, 0) + mulhdu r5, r8, v0 + ADDSUBC r10, r7, r11 + std r10, 0(rp) +L(cj2): ld r11, 8(rp) + mulld r7, r9, v0 + addex( r0, r0, r12, 0) + mulhdu r12, r9, v0 + ADDSUBC r10, r0, r11 + std r10, 8(rp) + ld r11, 16(rp) + addex( r7, r7, r5, 0) + ADDSUBC r10, r7, r11 + std r10, 16(rp) + li r0, 0 + addex( r3, r12, r0, 0) +AM(` addze r3, r3 ') +SM(` subfe r0, r0, r0 ') +SM(` sub r3, r3, r0 ') + blr +EPILOGUE() diff --git a/gcc/gmp/mpn/powerpc64/mode64/p9/gcd_11.asm b/gcc/gmp/mpn/powerpc64/mode64/p9/gcd_11.asm new file mode 100644 index 0000000..2dc982d 100644 --- /dev/null +++ b/gcc/gmp/mpn/powerpc64/mode64/p9/gcd_11.asm @@ -1,0 +1,64 @@ +dnl PowerPC-64 mpn_gcd_11. + +dnl Copyright 2000-2002, 2005, 2009, 2011-2013, 2019 Free Software Foundation, +dnl Inc. 
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/bit (approx) +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 - +C POWER8 - +C POWER9 5.75 +C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1 + +define(`u0', `r3') +define(`v0', `r4') + +define(`cnt', `r9')dnl + +ASM_START() +PROLOGUE(mpn_gcd_11) + b L(odd) + + ALIGN(16) +L(top): isel v0, u0, v0, 29 C v = min(u,v) + isel u0, r10, r11, 29 C u = |v - u| + srd u0, u0, cnt +L(odd): subf r10, u0, v0 C r10 = v - u + subf r11, v0, u0 C r11 = u - v + cmpld cr7, v0, u0 + cnttzd cnt, r10 + bne cr7, L(top) + +L(end): blr +EPILOGUE() diff --git a/gcc/gmp/mpn/powerpc64/mode64/p9/gcd_22.asm b/gcc/gmp/mpn/powerpc64/mode64/p9/gcd_22.asm new file mode 100644 index 0000000..12d11b0 100644 --- /dev/null +++ b/gcc/gmp/mpn/powerpc64/mode64/p9/gcd_22.asm @@ -1,0 +1,143 @@ +dnl PowerPC-64 mpn_gcd_22 optimised for POWER9. 
+ +dnl Copyright 2000-2002, 2005, 2009, 2011-2013, 2019 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/bit (approx) +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 - +C POWER8 - +C POWER9 9.58 + +C We define SLOW if this target uses a slow struct return mechanism, with +C r3 as an implicit parameter for the struct pointer. 
+undefine(`SLOW')dnl +ifdef(`AIX',`define(`SLOW',`due to AIX')',` + ifdef(`DARWIN',,` + ifdef(`ELFv2_ABI',,`define(`SLOW',`due to ELFv1')')dnl + ') +') + +ifdef(`SLOW',` +define(`IFSLOW', `$1') +define(`u1', `r4') +define(`u0', `r5') +define(`v1', `r6') +define(`v0', `r7') +',` +define(`IFSLOW', `') +define(`u1', `r3') +define(`u0', `r4') +define(`v1', `r5') +define(`v0', `r6') +') + +define(`tmp', `r0') +define(`t0', `r8') +define(`t1', `r9') +define(`s0', `r10') +define(`s1', `r11') +define(`cnt', `r12') + +ASM_START() +PROLOGUE(mpn_gcd_22) + cmpld cr7, v0, u0 +L(top): subfc t0, v0, u0 C 0 12 + beq cr7, L(lowz) C taken when u0 = v0 + subfe t1, v1, u1 C 2 14 + subfe. tmp, tmp, tmp C 4 set cr0 from the carry bit + subfc s0, u0, v0 C 0 + subfe s1, u1, v1 C 2 + +L(bck): cnttzd cnt, t0 C 2 + subfic tmp, cnt, 64 C 4 + + isel v0, v0, u0, 2 C 6 use condition set by subfe + isel u0, t0, s0, 2 C 6 + isel v1, v1, u1, 2 C 6 + isel u1, t1, s1, 2 C 6 + + srd u0, u0, cnt C 8 + sld tmp, u1, tmp C 8 + srd u1, u1, cnt C 8 + or u0, u0, tmp C 10 + + or. r0, u1, v1 C 10 + cmpld cr7, v0, u0 + bne L(top) C loop while either high limb is nonzero + + + b L(odd) C both high limbs are zero: finish with the one-limb loop + ALIGN(16) +L(top1):isel v0, u0, v0, 29 C v = min(u,v) + isel u0, r10, r11, 29 C u = |u - v| + srd u0, u0, cnt +L(odd): subf r10, u0, v0 C r10 = v - u + subf r11, v0, u0 C r11 = u - v + cmpld cr7, v0, u0 + cnttzd cnt, r10 + bne cr7, L(top1) C exit when u0 = v0, which leaves r10 = 0 + +ifdef(`SLOW',` + std v0, 0(r3) + std r10, 8(r3) +',` + mr r3, v0 + li r4, 0 +') + blr + + +L(lowz):C We come here when v0 - u0 = 0 + C 1. If v1 - u1 = 0, then gcd is u = v. + C 2. Else compute gcd_21({v1,v0}, |u1-v1|) + subfc. t0, v1, u1 C 2 8 + beq L(end) + li t1, 0 + subfe.
tmp, tmp, tmp C 4 set cr0 from the carry bit + subf s0, u1, v1 C 2 + li s1, 0 + b L(bck) C rejoin the main loop with {t1,t0} = {0, u1 - v1} and {s1,s0} = {0, v1 - u1} + +L(end): +ifdef(`SLOW',` + std v0, 0(r3) + std v1, 8(r3) + blr +',` + mr r3, v0 + mr r4, v1 + blr +') +EPILOGUE() diff --git a/gcc/gmp/mpn/powerpc64/mode64/p9/gmp-mparam.h b/gcc/gmp/mpn/powerpc64/mode64/p9/gmp-mparam.h new file mode 100644 index 0000000..5650def 100644 --- /dev/null +++ b/gcc/gmp/mpn/powerpc64/mode64/p9/gmp-mparam.h @@ -1,0 +1,253 @@ +/* POWER9 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/.
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 2200MHz POWER9 */ +/* FFT tuning limit = 221,245,838 */ +/* Generated by tuneup.c, 2019-10-29, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 7 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 44 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 11 +#define USE_PREINV_DIVREM_1 0 +#define DIV_QR_1N_PI1_METHOD 2 /* 19.28% faster than 1 */ +#define DIV_QR_1_NORM_THRESHOLD 3 +#define DIV_QR_1_UNNORM_THRESHOLD 2 +#define DIV_QR_2_PI2_THRESHOLD 7 +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 33 + +#define DIV_1_VS_MUL_1_PERCENT 365 + +#define MUL_TOOM22_THRESHOLD 34 +#define MUL_TOOM33_THRESHOLD 109 +#define MUL_TOOM44_THRESHOLD 458 +#define MUL_TOOM6H_THRESHOLD 517 +#define MUL_TOOM8H_THRESHOLD 608 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 113 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 292 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 204 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 211 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 178 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 46 +#define SQR_TOOM3_THRESHOLD 158 +#define SQR_TOOM4_THRESHOLD 674 +#define SQR_TOOM6_THRESHOLD 0 /* always */ +#define SQR_TOOM8_THRESHOLD 898 + +#define MULMID_TOOM42_THRESHOLD 70 + +#define MULMOD_BNM1_THRESHOLD 17 +#define SQRMOD_BNM1_THRESHOLD 25 + +#define MUL_FFT_MODF_THRESHOLD 404 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 404, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \ + { 13, 5}, { 27, 6}, { 27, 7}, { 14, 6}, \ + { 29, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \ + { 17, 7}, { 35, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 39, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 31, 8}, \ + { 63, 9}, { 35, 8}, { 71, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, 
\ + { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 135,11}, { 79,10}, { 159,11}, { 95,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,11}, \ + { 143,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,12}, { 95,11}, { 191,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 271,10}, \ + { 543,11}, { 287,10}, { 575,11}, { 303,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 335,10}, \ + { 671,11}, { 351,10}, { 703,11}, { 367,10}, \ + { 735,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,10}, { 831,12}, { 223,11}, { 447,10}, \ + { 895,11}, { 479,13}, { 127,12}, { 255,11}, \ + { 511,10}, { 1023,11}, { 543,12}, { 287,11}, \ + { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \ + { 639,10}, { 1279,11}, { 671,12}, { 351,11}, \ + { 703,10}, { 1407,11}, { 735,13}, { 191,12}, \ + { 383,11}, { 767,10}, { 1535,11}, { 799,12}, \ + { 415,11}, { 831,10}, { 1663,11}, { 863,12}, \ + { 447,11}, { 895,12}, { 479,14}, { 127,13}, \ + { 255,12}, { 511,11}, { 1023,12}, { 543,11}, \ + { 1087,12}, { 575,11}, { 1151,12}, { 607,13}, \ + { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \ + { 1343,12}, { 703,11}, { 1407,12}, { 735,11}, \ + { 1471,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 799,11}, { 1599,12}, { 831,11}, { 1663,13}, \ + { 447,12}, { 895,11}, { 1791,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1087,11}, { 2175,13}, \ + { 575,12}, { 1215,13}, { 639,12}, { 1343,13}, \ + { 703,12}, { 1471,14}, { 383,13}, { 767,12}, \ + { 1599,13}, { 831,12}, { 1727,13}, { 895,11}, \ + { 3583,12}, { 1919,15}, { 255,14}, { 511,13}, \ + { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1471,14}, { 767,13}, \ + { 1599,12}, { 3199,13}, { 1727,14}, { 895,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \ + { 2687,14}, { 1407,13}, { 2943,15}, { 767,14}, \ + { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \ + { 6911,14}, { 1919,16}, { 511,15}, { 
1023,14}, \ + { 2175,13}, { 4479,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2943,13}, { 5887,15}, { 1535,14}, \ + { 3455,13}, { 6911,15}, { 1791,14}, { 3839,13}, \ + { 7679,16}, { 1023,15}, { 2047,14}, { 4351,15}, \ + { 2303,14}, { 4863,15}, { 2815,14}, { 5887,16}, \ + { 1535,15}, { 3327,14}, { 6911,15}, { 3839,14}, \ + { 7679,17}, { 1023,16}, { 2047,15}, { 4351,14}, \ + { 8959,15}, { 4863,16}, { 2559,15}, { 5887,14}, \ + { 11775,16}, { 3071,15}, { 32768,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 243 +#define MUL_FFT_THRESHOLD 3712 + +#define SQR_FFT_MODF_THRESHOLD 404 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 404, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 29, 7}, { 15, 6}, { 31, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 32, 8}, \ + { 17, 7}, { 35, 8}, { 29, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ + { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 135,11}, { 79,10}, \ + { 159,11}, { 95,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271,11}, { 143,10}, \ + { 287, 9}, { 575,10}, { 303,11}, { 159,12}, \ + { 95,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ + { 575,11}, { 303,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 335,10}, { 671,11}, { 351,10}, \ + { 703,11}, { 367,10}, { 735,12}, { 191,11}, \ + { 383,10}, { 767,11}, { 415,12}, { 223,11}, \ + { 447,10}, { 895,13}, { 127,12}, { 255,11}, \ + { 511,10}, { 1023,11}, { 543,12}, { 287,11}, \ + { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \ + { 671,12}, { 351,11}, { 703,10}, { 1407,11}, \ + { 735,13}, { 191,12}, { 383,11}, { 767,10}, \ + { 1535,12}, { 415,11}, { 831,12}, { 447,11}, \ + { 895,12}, { 479,14}, { 127,13}, { 255,12}, \ + { 511,11}, 
{ 1023,12}, { 543,11}, { 1087,12}, \ + { 575,11}, { 1151,12}, { 607,13}, { 319,12}, \ + { 639,11}, { 1279,12}, { 671,11}, { 1343,12}, \ + { 703,11}, { 1407,12}, { 735,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 799,11}, { 1599,12}, \ + { 831,13}, { 447,12}, { 895,11}, { 1791,12}, \ + { 959,14}, { 255,13}, { 511,12}, { 1023,11}, \ + { 2047,12}, { 1087,13}, { 575,12}, { 1215,13}, \ + { 639,12}, { 1343,13}, { 703,12}, { 1407,14}, \ + { 383,13}, { 767,12}, { 1599,13}, { 831,12}, \ + { 1727,13}, { 895,12}, { 1791,13}, { 959,15}, \ + { 255,14}, { 511,13}, { 1023,12}, { 2047,13}, \ + { 1087,12}, { 2175,13}, { 1215,14}, { 639,13}, \ + { 1343,12}, { 2687,13}, { 1471,14}, { 767,13}, \ + { 1599,12}, { 3199,13}, { 1727,14}, { 895,13}, \ + { 1919,15}, { 511,14}, { 1023,13}, { 2175,14}, \ + { 1151,13}, { 2431,12}, { 4863,14}, { 1279,13}, \ + { 2687,14}, { 1407,13}, { 2815,15}, { 767,14}, \ + { 1535,13}, { 3199,14}, { 1663,13}, { 3455,14}, \ + { 1919,16}, { 511,15}, { 1023,14}, { 2175,13}, \ + { 4479,14}, { 2431,13}, { 4863,15}, { 1279,14}, \ + { 2943,13}, { 5887,15}, { 1535,14}, { 3455,13}, \ + { 6911,15}, { 1791,14}, { 3839,16}, { 1023,15}, \ + { 2047,14}, { 4479,15}, { 2303,14}, { 4863,15}, \ + { 2559,14}, { 5119,15}, { 2815,14}, { 5887,16}, \ + { 1535,15}, { 3327,14}, { 6911,15}, { 3839,17}, \ + { 1023,16}, { 2047,15}, { 4351,14}, { 8959,15}, \ + { 4863,16}, { 2559,15}, { 5887,14}, { 11775,16}, \ + { 3071,15}, { 32768,16}, { 65536,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 230 +#define SQR_FFT_THRESHOLD 3264 + +#define MULLO_BASECASE_THRESHOLD 3 +#define MULLO_DC_THRESHOLD 39 +#define MULLO_MUL_N_THRESHOLD 7246 +#define SQRLO_BASECASE_THRESHOLD 6 +#define SQRLO_DC_THRESHOLD 40 +#define SQRLO_SQR_THRESHOLD 6440 + +#define DC_DIV_QR_THRESHOLD 30 +#define DC_DIVAPPR_Q_THRESHOLD 88 +#define DC_BDIV_QR_THRESHOLD 35 +#define DC_BDIV_Q_THRESHOLD 62 + +#define INV_MULMOD_BNM1_THRESHOLD 
79 +#define INV_NEWTON_THRESHOLD 11 +#define INV_APPR_THRESHOLD 11 + +#define BINV_NEWTON_THRESHOLD 264 +#define REDC_1_TO_REDC_2_THRESHOLD 8 +#define REDC_2_TO_REDC_N_THRESHOLD 79 + +#define MU_DIV_QR_THRESHOLD 1442 +#define MU_DIVAPPR_Q_THRESHOLD 1470 +#define MUPI_DIV_QR_THRESHOLD 0 /* always */ +#define MU_BDIV_QR_THRESHOLD 1470 +#define MU_BDIV_Q_THRESHOLD 1652 + +#define POWM_SEC_TABLE 1,16,151,839 + +#define GET_STR_DC_THRESHOLD 7 +#define GET_STR_PRECOMPUTE_THRESHOLD 15 +#define SET_STR_DC_THRESHOLD 406 +#define SET_STR_PRECOMPUTE_THRESHOLD 885 + +#define FAC_DSC_THRESHOLD 179 +#define FAC_ODD_THRESHOLD 53 + +#define MATRIX22_STRASSEN_THRESHOLD 19 +#define HGCD2_DIV1_METHOD 1 /* 9.10% faster than 3 */ +#define HGCD_THRESHOLD 45 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 2479 +#define GCD_DC_THRESHOLD 321 +#define GCDEXT_DC_THRESHOLD 258 +#define JACOBI_BASE_METHOD 4 /* 15.45% faster than 1 */ + +/* Tuneup completed successfully, took 179422 seconds */ diff --git a/gcc/gmp/mpn/powerpc64/mode64/p9/mul_1.asm b/gcc/gmp/mpn/powerpc64/mode64/p9/mul_1.asm new file mode 100644 index 0000000..363f095 100644 --- /dev/null +++ b/gcc/gmp/mpn/powerpc64/mode64/p9/mul_1.asm @@ -1,0 +1,126 @@ +dnl Power9 mpn_mul_1. + +dnl Copyright 2017, 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 ? +C POWER4/PPC970 ? +C POWER5 ? +C POWER6 ? +C POWER7 ? +C POWER8 ? +C POWER9 2.47 + +C TODO +C * Schedule for Power9 pipeline. +C * Unroll 4x if that proves beneficial. +C * This is marginally faster (but much smaller) than ../mul_1.asm. + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') +define(`v0', `r6') + +ASM_START() +PROLOGUE(mpn_mul_1c) + b L(ent) C r7 as passed by the caller is the initial carry limb +EPILOGUE() +PROLOGUE(mpn_mul_1) + li r7, 0 C no carry-in +L(ent): ld r11, 0(up) + cmpdi cr6, n, 2 + addi r0, n, -1 C FIXME: postpone + srdi r0, r0, 1 C FIXME: postpone + mtctr r0 C FIXME: postpone + rldicl.
r12, n, 0,63 C r12 = n & 1, set cr0 + bne cr0, L(b1) C n odd + +L(b0): ld r0, 8(up) + maddld( r9, r11, v0, r7) + maddhdu(r7, r11, v0, r7) + ble cr6, L(2) C n even and n <= 2 + ld r12, 16(up) + mulld r8, r0, v0 + mulhdu r5, r0, v0 + addic up, up, 16 C addic also writes CA, presumably clearing it for the adde chain + addi rp, rp, -8 + b L(mid) + +L(b1): ld r0, 0(up) + ble cr6, L(1) C n odd and n <= 2 + ld r12, 8(up) + maddld( r8, r11, v0, r7) + maddhdu(r5, r11, v0, r7) + ld r0, 16(up) + mulld r9, r12, v0 + mulhdu r7, r12, v0 + addic up, up, 24 C addic also writes CA, presumably clearing it for the adde chain + bdz L(end) + + ALIGN(16) +L(top): ld r12, 0(up) + std r8, 0(rp) + adde r9, r5, r9 + mulld r8, r0, v0 + mulhdu r5, r0, v0 +L(mid): ld r0, 8(up) + std r9, 8(rp) + adde r8, r7, r8 + mulld r9, r12, v0 + mulhdu r7, r12, v0 + addi rp, rp, 16 + addi up, up, 16 + bdnz L(top) + +L(end): std r8, 0(rp) + mulld r8, r0, v0 + adde r9, r5, r9 + mulhdu r5, r0, v0 + std r9, 8(rp) + adde r8, r7, r8 + std r8, 16(rp) + addze r3, r5 C return the top limb plus the last carry + blr + +L(2): mulld r8, r0, v0 + mulhdu r5, r0, v0 + std r9, 0(rp) + addc r8, r7, r8 + std r8, 8(rp) + addze r3, r5 + blr + +L(1): maddld( r8, r0, v0, r7) + std r8, 0(rp) + maddhdu(r3, r0, v0, r7) + blr +EPILOGUE() diff --git a/gcc/gmp/mpn/powerpc64/mode64/p9/mul_2.asm b/gcc/gmp/mpn/powerpc64/mode64/p9/mul_2.asm new file mode 100644 index 0000000..632b6cb 100644 --- /dev/null +++ b/gcc/gmp/mpn/powerpc64/mode64/p9/mul_2.asm @@ -1,0 +1,170 @@ +dnl Power9 mpn_mul_2. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C power9: 1.58 + +C STATUS +C * Not written with any power9 pipeline understanding. +C * The 4x unrolling was not motivated by any timing tests. +C * No local scheduling for performance tweaking has been done. +C * Decrease load scheduling! + +define(`rp', `r3') +define(`up', `r4') +define(`n', `r5') C Note: Reused as scratch +define(`vp', `r6') C Note: Reused for v1 + +define(`v0', `r7') +define(`v1', `r6') + + +ASM_START() +PROLOGUE(mpn_mul_2) + std r28, -32(r1) C save r28-r31 below the stack pointer (reloaded at the ret label) + std r29, -24(r1) + std r30, -16(r1) + std r31, -8(r1) + + subfic r0, n, 0 C clear CA + subfo r0, r0, r0 C clear OV and r0 + + cmpdi cr7, n, 4 + + ld v0, 0(vp) + ld v1, 8(vp) C v1 aliases vp (both r6), so vp is dead after this load + + srdi r10, n, 2 + mtctr r10 C ctr = n / 4 + + rldicl. r9, n, 0, 63 C r9 = n & 1, set cr0 + bne cr0, L(bx1) + +L(bx0): rldicl. r9, n, 63, 63 C r9 = (n >> 1) & 1, set cr0 + + ld r8, 0(up) + ld r9, 8(up) + li r11, 0 + mulld r28, r8, v0 + mulhdu r31, r8, v0 + blt cr7, L(2) C n < 4 + mulld r5, r8, v1 + mulhdu r10, r8, v1 + bne cr0, L(b10) + +L(b00): addi up, up, -8 + addi rp, rp, -24 + b L(lo0) + +L(b10): addi up, up, 8 + addi rp, rp, -8 + b L(lo2) + +L(2): addi rp, rp, -8 + mulld r5, r8, v1 + mulhdu r10, r8, v1 + b L(cj2) + +L(bx1): rldicl.
r9, n, 63, 63 C r9 = (n >> 1) & 1, set cr0 + + ld r9, 0(up) + ld r8, 8(up) + li r10, 0 + mulld r29, r9, v0 + mulhdu r30, r9, v0 + mulld r12, r9, v1 + mulhdu r11, r9, v1 + bne cr0, L(b11) + +L(b01): addi rp, rp, -16 + b L(lo1) +L(b11): addi up, up, 16 + blt cr7, L(end) C n < 4 + +L(top): ld r9, 0(up) + maddld( r28, r8, v0, r10) C 0 4 -> adde + maddhdu(r31, r8, v0, r10) C 1 5 + adde r0, r29, r0 C 7 11 + std r0, 0(rp) + mulld r5, r8, v1 C 1 5 -> addex + mulhdu r10, r8, v1 C 2 6 + addex( r0, r12, r30, 0) C 8 12 +L(lo2): ld r8, 8(up) + maddld( r29, r9, v0, r11) C 1 5 -> adde + maddhdu(r30, r9, v0, r11) C 2 6 + adde r0, r28, r0 C 8 12 + std r0, 8(rp) + mulld r12, r9, v1 C 2 6 -> addex + mulhdu r11, r9, v1 C 3 7 + addex( r0, r5, r31, 0) C 5 9 13 +L(lo1): ld r9, 16(up) + maddld( r28, r8, v0, r10) C 2 6 -> adde + maddhdu(r31, r8, v0, r10) C 3 7 + adde r0, r29, r0 C 5 9 13 + std r0, 16(rp) + mulld r5, r8, v1 C 3 7 -> addex + mulhdu r10, r8, v1 C 4 8 + addex( r0, r12, r30, 0) C 6 10 +L(lo0): ld r8, 24(up) + maddld( r29, r9, v0, r11) C 3 7 -> adde + maddhdu(r30, r9, v0, r11) C 4 8 + adde r0, r28, r0 C 6 10 + std r0, 24(rp) + mulld r12, r9, v1 C 4 8 -> addex + mulhdu r11, r9, v1 C 5 9 + addex( r0, r5, r31, 0) C 7 11 + addi up, up, 32 + addi rp, rp, 32 + bdnz L(top) + +L(end): ld r9, 0(up) + maddld( r28, r8, v0, r10) C 0 4 + maddhdu(r31, r8, v0, r10) C 1 5 + adde r0, r29, r0 C 7 11 + std r0, 0(rp) C -4 + mulld r5, r8, v1 C 1 5 + mulhdu r10, r8, v1 C 2 6 + addex( r0, r12, r30, 0) C 8 12 +L(cj2): maddld( r29, r9, v0, r11) C 1 5 -2 + maddhdu(r30, r9, v0, r11) C 2 6 -1 + adde r0, r28, r0 C 8 12 -3 + std r0, 8(rp) C -3 + mulld r12, r9, v1 C 2 6 -1 + mulhdu r11, r9, v1 C 3 7 0 = return limb + addex( r0, r5, r31, 0) C 5 9 13 + adde r0, r29, r0 C 5 9 13 -2 + std r0, 16(rp) C -2 + addex( r0, r12, r30, 0) C 6 10 -1 + adde r0, r0, r10 C -1 + std r0, 24(rp) C -1 + li r4, 0 + addze r3, r11 C return limb plus the last CA carry + addex( r3, r3, r4, 0) C plus the last OV carry + +L(ret): ld r28, -32(r1) C restore r28-r31 + ld r29, -24(r1) + ld r30, -16(r1) + ld r31, -8(r1) + blr +EPILOGUE() +ASM_END() diff
--git a/gcc/gmp/mpn/powerpc64/mode64/p9/mul_basecase.asm b/gcc/gmp/mpn/powerpc64/mode64/p9/mul_basecase.asm new file mode 100644 index 0000000..8f3d322 100644 --- /dev/null +++ b/gcc/gmp/mpn/powerpc64/mode64/p9/mul_basecase.asm @@ -1,0 +1,415 @@ +dnl Power9 mpn_mul_basecase. + +dnl Copyright 1999-2001, 2003-2006, 2008, 2017-2018 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 - +C POWER8 - +C POWER9 1.62 + +C TODO +C * Check if (inner) loop alignment affects performance. +C * Could we schedule loads less in addmul_2/mul_2? That would save some regs +C and make the tail code more manageable. +C * Postpone some register saves to main loop. +C * Perhaps write more small operands (3x1, 3x2, 3x3) code. +C * Consider restoring rp,up after loop using arithmetic, eliminating rp2, up2. 
+C On the other hand, the current rp,up restore registers are useful for OSP. +C * Do OSP. This should save a lot with the current deep addmul_2 pipeline. + +C INPUT PARAMETERS +define(`rp', `r3') +define(`up', `r4') +define(`un', `r5') +define(`vp', `r6') +define(`vn', `r7') + +define(`v0', `r0') +define(`v1', `r7') +define(`rp2', `r24') +define(`up2', `r25') + +ASM_START() +PROLOGUE(mpn_mul_basecase) + cmpdi cr0, un, 2 + bgt cr0, L(un_gt2) + cmpdi cr6, vn, 1 + ld r7, 0(vp) + ld r5, 0(up) + mulld r8, r5, r7 C weight 0 + mulhdu r9, r5, r7 C weight 1 + std r8, 0(rp) + beq cr0, L(2x) + std r9, 8(rp) + blr + ALIGN(16) +L(2x): ld r0, 8(up) + mulld r8, r0, r7 C weight 1 + mulhdu r10, r0, r7 C weight 2 + addc r9, r9, r8 + addze r10, r10 + bne cr6, L(2x2) + std r9, 8(rp) + std r10, 16(rp) + blr + ALIGN(16) +L(2x2): ld r6, 8(vp) + mulld r8, r5, r6 C weight 1 + mulhdu r11, r5, r6 C weight 2 + addc r9, r9, r8 + std r9, 8(rp) + adde r11, r11, r10 + mulld r12, r0, r6 C weight 2 + mulhdu r0, r0, r6 C weight 3 + addze r0, r0 + addc r11, r11, r12 + addze r0, r0 + std r11, 16(rp) + std r0, 24(rp) + blr + +L(un_gt2): + std r22, -80(r1) + std r23, -72(r1) + std r24, -64(r1) + std r25, -56(r1) + std r26, -48(r1) + std r27, -40(r1) + std r28, -32(r1) + std r29, -24(r1) + std r30, -16(r1) + std r31, -8(r1) + mr rp2, r3 C rp + mr up2, r4 C up + srdi r22, r5, 2 C un / 4, moved to ctr before each 2-limb pass + subfic r23, r7, 0 C -vn, clear CA + subfo r0, r0, r0 C clear OV (and r0) + + cmpdi cr6, un, 3 + rldicl r0, un, 0, 63 C r0 = un & 1 + cmpdi cr7, r0, 0 + rldicl r0, un, 63, 63 C FIXME: unused for vn = 1 + cmpdi cr5, r0, 0 C FIXME: unused for vn = 1 + + ld v0, 0(vp) + rldicl.
r9, vn, 0, 63 C r9 = vn & 1, set cr0 + beq cr0, L(vn_evn) + +L(vn_odd): + addi r10, un, -2 + ld r5, 0(up) + srdi r10, r10, 1 + mtctr r10 + bne cr7, L(m1_b1) + +L(m1_b0): + ld r10, 8(up) + mulld r9, r5, v0 + mulhdu r11, r5, v0 + ld r12, 16(up) + mulld r8, r10, v0 + mulhdu r5, r10, v0 + addi rp, rp, -8 + b L(m1_mid) + +L(m1_b1): + ld r12, 8(up) + mulld r8, r5, v0 + mulhdu r5, r5, v0 + ld r10, 16(up) + mulld r9, r12, v0 + mulhdu r11, r12, v0 + addi up, up, 8 + beq cr6, L(m1_end) C jump taken means un = 3, vn = {1,3} + + ALIGN(16) +L(m1_top): + ld r12, 16(up) + std r8, 0(rp) + adde r9, r5, r9 + mulld r8, r10, v0 + mulhdu r5, r10, v0 +L(m1_mid): + ld r10, 24(up) + std r9, 8(rp) + adde r8, r11, r8 + mulld r9, r12, v0 + mulhdu r11, r12, v0 + addi rp, rp, 16 + addi up, up, 16 + bdnz L(m1_top) + +L(m1_end): + std r8, 0(rp) + mulld r8, r10, v0 + adde r9, r5, r9 + mulhdu r5, r10, v0 + std r9, 8(rp) + adde r8, r11, r8 + std r8, 16(rp) + addze r10, r5 + std r10, 24(rp) + + addi rp2, rp2, 8 + addi vp, vp, 8 + addic. r23, r23, 1 C one v limb consumed; cr0 is tested at the do_outer label + b L(do_outer) + +L(vn_evn): + ld v1, 8(vp) + addi r23, r23, 2 + mtctr r22 + bne cr7, L(m2_bx1) + +L(m2_bx0): + ld r8, 0(up) + ld r9, 8(up) + li r11, 0 + mulld r28, r8, v0 + mulhdu r31, r8, v0 + mulld r5, r8, v1 + mulhdu r10, r8, v1 + li r12, 0 + bne cr5, L(m2_b10) + +L(m2_b00): + addi up, up, -8 + addi rp, rp, -24 + b L(m2_lo0) + +L(m2_b10): + addi up, up, 8 + addi rp, rp, -8 + b L(m2_lo2) + +L(m2_bx1): + ld r9, 0(up) + ld r8, 8(up) + li r10, 0 + mulld r29, r9, v0 + mulhdu r30, r9, v0 + mulld r12, r9, v1 + mulhdu r11, r9, v1 + li r5, 0 + bne cr5, L(m2_b11) + +L(m2_b01): + addi rp, rp, -16 + b L(m2_lo1) +L(m2_b11): + addi up, up, 16 + beq cr6, L(m2_end) C taken means un = 3, vn = 2. We're done.
+ +L(m2_top): + ld r9, 0(up) + maddld( r28, r8, v0, r10) + maddhdu(r31, r8, v0, r10) + adde r5, r29, r5 + std r5, 0(rp) + mulld r5, r8, v1 + mulhdu r10, r8, v1 + addex( r12, r12, r30, 0) +L(m2_lo2): + ld r8, 8(up) + maddld( r29, r9, v0, r11) + maddhdu(r30, r9, v0, r11) + adde r12, r28, r12 + std r12, 8(rp) + mulld r12, r9, v1 + mulhdu r11, r9, v1 + addex( r5, r5, r31, 0) +L(m2_lo1): + ld r9, 16(up) + maddld( r28, r8, v0, r10) + maddhdu(r31, r8, v0, r10) + adde r5, r29, r5 + std r5, 16(rp) + mulld r5, r8, v1 + mulhdu r10, r8, v1 + addex( r12, r12, r30, 0) +L(m2_lo0): + ld r8, 24(up) + maddld( r29, r9, v0, r11) + maddhdu(r30, r9, v0, r11) + adde r12, r28, r12 + std r12, 24(rp) + mulld r12, r9, v1 + mulhdu r11, r9, v1 + addex( r5, r5, r31, 0) + addi up, up, 32 + addi rp, rp, 32 + bdnz L(m2_top) + +L(m2_end): + ld r9, 0(up) + maddld( r28, r8, v0, r10) + maddhdu(r31, r8, v0, r10) + adde r5, r29, r5 + std r5, 0(rp) + mulld r5, r8, v1 + mulhdu r10, r8, v1 + b L(cj) + +L(outer): + ld v0, 0(vp) C each outer pass takes two more v limbs and accumulates into the limbs at rp2 + ld v1, 8(vp) + addi r23, r23, 2 + mtctr r22 + bne cr7, L(bx1) + +L(bx0): ld r26, 0(rp2) + ld r8, 0(up2) + ld r11, 8(rp2) + ld r9, 8(up2) + maddld( r28, r8, v0, r26) + maddhdu(r31, r8, v0, r26) + ld r26, 16(rp2) + mulld r5, r8, v1 + mulhdu r10, r8, v1 + li r12, 0 + bne cr5, L(b10) + +L(b00): addi up, up2, -8 + addi rp, rp2, -24 + b L(lo0) + +L(b10): addi up, up2, 8 + addi rp, rp2, -8 + b L(lo2) + +L(bx1): ld r27, 0(rp2) + ld r9, 0(up2) + ld r10, 8(rp2) + ld r8, 8(up2) + maddld( r29, r9, v0, r27) + maddhdu(r30, r9, v0, r27) + ld r27, 16(rp2) + mulld r12, r9, v1 + mulhdu r11, r9, v1 + li r5, 0 + bne cr5, L(b11) + +L(b01): addi up, up2, 0 + addi rp, rp2, -16 + b L(lo1) +L(b11): addi up, up2, 16 + addi rp, rp2, 0 + beq cr6, L(end) C taken means un = 3, vn = 3. We're done.
+ +L(top): ld r9, 0(up) + maddld( r28, r8, v0, r10) + maddhdu(r31, r8, v0, r10) + adde r5, r29, r5 + ld r26, 24(rp) + std r5, 0(rp) + maddld( r5, r8, v1, r27) + maddhdu(r10, r8, v1, r27) + addex( r12, r12, r30, 0) +L(lo2): ld r8, 8(up) + maddld( r29, r9, v0, r11) + maddhdu(r30, r9, v0, r11) + adde r12, r28, r12 + ld r27, 32(rp) + std r12, 8(rp) + maddld( r12, r9, v1, r26) + maddhdu(r11, r9, v1, r26) + addex( r5, r5, r31, 0) +L(lo1): ld r9, 16(up) + maddld( r28, r8, v0, r10) + maddhdu(r31, r8, v0, r10) + adde r5, r29, r5 + ld r26, 40(rp) + std r5, 16(rp) + maddld( r5, r8, v1, r27) + maddhdu(r10, r8, v1, r27) + addex( r12, r12, r30, 0) +L(lo0): ld r8, 24(up) + maddld( r29, r9, v0, r11) + maddhdu(r30, r9, v0, r11) + adde r12, r28, r12 + ld r27, 48(rp) + std r12, 24(rp) + maddld( r12, r9, v1, r26) + maddhdu(r11, r9, v1, r26) + addex( r5, r5, r31, 0) + addi up, up, 32 + addi rp, rp, 32 + bdnz L(top) + +L(end): ld r9, 0(up) + maddld( r28, r8, v0, r10) + maddhdu(r31, r8, v0, r10) + adde r5, r29, r5 + std r5, 0(rp) + maddld( r5, r8, v1, r27) + maddhdu(r10, r8, v1, r27) +L(cj): addex( r12, r12, r30, 0) + maddld( r29, r9, v0, r11) + maddhdu(r30, r9, v0, r11) + adde r12, r28, r12 + std r12, 8(rp) + mulld r12, r9, v1 + mulhdu r11, r9, v1 + addex( r5, r5, r31, 0) + adde r5, r29, r5 + std r5, 16(rp) + addex( r12, r12, r30, 0) + adde r12, r12, r10 + std r12, 24(rp) + li r4, 0 + addze r5, r11 C top limb plus the last CA carry + addex( r5, r5, r4, 0) C plus the last OV carry + std r5, 32(rp) + + cmpdi cr0, r23, 0 C r23 started at -vn and counts up; 0 means every v limb is done + addi rp2, rp2, 16 + addi vp, vp, 16 +L(do_outer): + bne cr0, L(outer) +L(ret): + ld r22, -80(r1) C restore r22-r31 + ld r23, -72(r1) + ld r24, -64(r1) + ld r25, -56(r1) + ld r26, -48(r1) + ld r27, -40(r1) + ld r28, -32(r1) + ld r29, -24(r1) + ld r30, -16(r1) + ld r31, -8(r1) + blr +EPILOGUE() +ASM_END() diff --git a/gcc/gmp/mpn/powerpc64/mode64/p9/sqr_basecase.asm b/gcc/gmp/mpn/powerpc64/mode64/p9/sqr_basecase.asm new file mode 100644 index 0000000..2d4fa63 100644 --- /dev/null +++ b/gcc/gmp/mpn/powerpc64/mode64/p9/sqr_basecase.asm @@ -1,0 +1,555
@@ +dnl Power9 mpn_sqr_basecase. + +dnl Copyright 1999-2001, 2003-2006, 2008, 2017-2018 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C POWER3/PPC630 - +C POWER4/PPC970 - +C POWER5 - +C POWER6 - +C POWER7 - +C POWER8 - +C POWER9 1.62 + +C TODO +C * Completely separate evn and odd code into two outer loops. Also consider +C unrolling these two outer loops and thereby eliminate all branches. +C * Avoid the reloading of u1 before every loop start. +C * Reduce register usage. +C * Consider getting rid of cy and instead load 3 u limbs, use addc+adde+adde. +C * Consider skewing conditional adjustments to allow mask creation with subfe +C like in the un=3 code. It might streamline the adjustments (or not). 
+
+C INPUT PARAMETERS: rp = result area, up = operand limbs, un = limb count
+define(`rp',	`r3')
+define(`up',	`r4')
+define(`un',	`r5')
+
+define(`u0',	`r0')
+define(`u1',	`r7')
+define(`rp2',	`r24')
+define(`up2',	`r25')
+define(`cy',	`r6')
+
+define(`LSHU1U0',`
+	addc	u0, u0, u0
+	adde	u1, u1, u1
+	li	cy, 0
+	addze	cy, cy
+')
+define(`LSHU1U',`
+	addc	u0, u0, u0
+	add	u0, u0, cy
+	adde	u1, u1, u1
+	li	cy, 0
+	addze	cy, cy
+')
+define(`LSHU1UF',`
+	addc	u0, u0, u0
+	add	u0, u0, cy
+	adde	u1, u1, u1
+')
+define(`LSHU1UHF',`
+	add	u0, u0, u0
+	add	u0, u0, cy
+')
+C These are cleverer replacements (not used below), but they tend to leave CA
+C set, disturbing the main accumulation code!  Breaking that false dependency
+C might have a positive performance impact.  Note that the subfe here results
+C in a mask for our adjustments.
+define(`xLSHU1U0',`
+	addc	u0, u0, u0
+	adde	u1, u1, u1
+	subfe	cy, cy, cy
+')
+define(`xLSHU1U',`
+	subfic	cy, cy, 0
+	adde	u0, u0, u0
+	adde	u1, u1, u1
+	subfe	cy, cy, cy
+')
+define(`xLSHU1UHF',`
+	subfic	cy, cy, 0
+	adde	u0, u0, u0
+')
+
+ASM_START()
+PROLOGUE(mpn_sqr_basecase)
+	ld	r0, 0(up)		C n = 1
+	mulld	r8, r0, r0		C weight 0
+	mulhdu	r9, r0, r0		C weight 1
+	std	r8, 0(rp)		C rp[0] = low limb of u0^2
+	cmpdi	cr0, un, 2
+	bge	cr0, L(ge2)		C un >= 2
+	std	r9, 8(rp)		C un = 1: rp[1] = high limb of u0^2
+	blr
+
+L(ge2):	bgt	cr0, L(gt2)		C un > 2
+	ld	r6, 8(up)		C un = 2: load u1
+	mulld	r10, r6, r6		C u1 * u1
+	mulhdu	r11, r6, r6		C u1 * u1
+	mulld	r4, r6, r0		C u1 * u0
+	mulhdu	r5, r6, r0		C u1 * u0
+	addc	r4, r4, r4		C double the cross product u1 * u0
+	adde	r5, r5, r5
+	addze	r11, r11		C carry from the doubling goes into the top limb
+	addc	r9, r9, r4		C add the doubled cross product at weight 1
+	adde	r10, r10, r5
+	addze	r11, r11
+	std	r9, 8(rp)
+	std	r10, 16(rp)
+	std	r11, 24(rp)
+	blr
+
+L(gt2):	cmpdi	cr0, un, 3
+	bgt	cr0, L(gt3)		C un > 3
+	std	r30, -16(r1)		C un = 3: save r30 and r31 (reloaded before blr)
+	std	r31, -8(r1)
+	subfo	r12, r12, r12		C clear OV (and result register)
+	ld	r8, 8(r4)		C load u1 (r4 is up)
+	mulld	r5, r8, r8		C W2
+	mulhdu	r10, r8, r8		C W3
+	sradi	r11, u0, 63		C CAUTION: clobbers CA
+	and	r11, r11, r8		C W3
+	addc	u0, u0, u0
+	adde	u1, r8, r8
+	subfe	r6, r6, r6		C	mask
+	ld	r4, 16(r4)		C W2
+	mulld	r12, r8, u0		C W1	u1 x u0
+	mulhdu	r8, r8, u0		C W2	u1 x u0
+	maddld(	r31, r4, u0, r11)	C W2
+	maddhdu(r30, r4, u0, r11)	C W3
+	andc	r6, r4, r6		C W4
+	addc	r9, r12, r9		C W1
+	std	r9, 8(rp)		C W1
+	mulld	r9, r4, u1		C W3
+	mulhdu	r11, r4, u1		C W4
+	addex(	r5, r5, r8, 0)		C W2
+	adde	r5, r31, r5		C W2
+	std	r5, 16(rp)		C W2
+	maddld(	r5, r4, r4, r6)		C W4	u2^2
+	maddhdu(r6, r4, r4, r6)		C W5	u2^2
+	addex(	r9, r9, r30, 0)		C W3
+	adde	r9, r9, r10		C W3
+	std	r9, 24(rp)		C W3
+	adde	r5, r5, r11		C W4
+	addze	r6, r6			C W5
+	li	r8, 0
+	addex(	r5, r5, r8, 0)		C W4
+	std	r5, 32(rp)		C W4
+	addex(	r6, r6, r8, 0)		C W5
+	std	r6, 40(rp)		C W5
+	ld	r30, -16(r1)
+	ld	r31, -8(r1)
+	blr
+
+L(gt3):	std	r22, -80(r1)		C un > 3: save r22-r31 (reloaded at L(ret))
+	std	r23, -72(r1)
+	std	r24, -64(r1)
+	std	r25, -56(r1)
+	std	r26, -48(r1)
+	std	r27, -40(r1)
+	std	r28, -32(r1)
+	std	r29, -24(r1)
+	std	r30, -16(r1)
+	std	r31, -8(r1)
+
+	mr	rp2, rp			C rp2 and up2 are the per-pass base pointers
+	mr	up2, up
+	addi	r22, un, -1		C count for loop FIXME: Adjust
+	subfo	r0, r0, r0		C clear OV (and r0)
+	rldicl	r0, un, 0, 63		C r0 = un & 1
+	cmpdi	cr7, r0, 0		C cr7: eq when un is even
+
+	ld	u0, 0(up2)
+	ld	u1, 8(up2)
+
+	cmpdi	cr5, r22, 4		C cr5 is kept for the ble/bgt tests later in the pass
+	srdi	r31, r22, 2
+	addi	r22, r22, -2
+	mtctr	r31			C ctr = r22 >> 2 (r22 as it was before the -2 above)
+
+	beq	cr7, L(m2_evn)		C un even
+L(m2_odd):
+	rldicl.	r31, r22, 63, 63	C r22 & 2
+	mulld	r23, u0, u0
+	mulhdu	r12, u0, u0
+	mulld	r5, u1, u1
+	mulhdu	r10, u1, u1
+
+	sradi	r11, u0, 63		C r11 = u1 if the top bit of u0 is set, else 0
+	and	r11, r11, u1
+
+	LSHU1U0
+
+	ld	r8, 8(up2)
+	ld	r9, 16(up2)
+	mulld	r28, r8, u0		C W	u1 x u0
+	mulhdu	r31, r8, u0		C W	u1 x u0
+	std	r23, 0(rp2)		C rp2[0] = low limb of u0^2
+
+	bne	cr0, L(m2_11)
+L(m2_01):
+	addi	up, up2, 16
+	addi	rp, rp2, 0
+	b	L(m2_lo2)
+L(m2_11):
+	addi	up, up2, 0
+	addi	rp, rp2, -16
+	b	L(m2_lo0)
+
+L(m2_evn):
+	rldicl.	r31, r22, 63, 63	C r22 & 2
+	mulld	r23, u0, u0
+	mulhdu	r5, u0, u0
+	mulld	r12, u1, u1
+	mulhdu	r11, u1, u1
+
+	sradi	r10, u0, 63		C r10 = u1 if the top bit of u0 is set, else 0
+	and	r10, r10, u1
+
+	LSHU1U0
+
+	ld	r9, 8(up2)
+	ld	r8, 16(up2)
+	mulld	r29, r9, u0		C W	u1 x u0
+	mulhdu	r30, r9, u0		C W	u1 x u0
+	std	r23, 0(rp2)		C rp2[0] = low limb of u0^2
+
+	beq	cr0, L(m2_10)
+L(m2_00):
+	addi	up, up2, 8
+	addi	rp, rp2, -8
+	b	L(m2_lo1)
+L(m2_10):
+	addi	up, up2, 24
+	addi	rp, rp2, 8
+	ble	cr5, L(m2_end)		C skip the loop when r22 was <= 4
+
+L(m2_top):
+	ld	r9, 0(up)
+	maddld(	r28, r8, u0, r10)
+	maddhdu(r31, r8, u0, r10)
+	adde	r5, r29, r5
+	std	r5, 0(rp)
+	mulld	r5, r8, u1
+	mulhdu	r10, r8, u1
+	addex(	r12, r12, r30, 0)
+L(m2_lo2):
+	ld	r8, 8(up)
+	maddld(	r29, r9, u0, r11)
+	maddhdu(r30, r9, u0, r11)
+	adde	r12, r28, r12
+	std	r12, 8(rp)
+	mulld	r12, r9, u1
+	mulhdu	r11, r9, u1
+	addex(	r5, r5, r31, 0)
+L(m2_lo1):
+	ld	r9, 16(up)
+	maddld(	r28, r8, u0, r10)
+	maddhdu(r31, r8, u0, r10)
+	adde	r5, r29, r5
+	std	r5, 16(rp)
+	mulld	r5, r8, u1
+	mulhdu	r10, r8, u1
+	addex(	r12, r12, r30, 0)
+L(m2_lo0):
+	ld	r8, 24(up)
+	maddld(	r29, r9, u0, r11)
+	maddhdu(r30, r9, u0, r11)
+	adde	r12, r28, r12
+	std	r12, 24(rp)
+	mulld	r12, r9, u1
+	mulhdu	r11, r9, u1
+	addex(	r5, r5, r31, 0)
+	addi	up, up, 32		C four limbs consumed per iteration
+	addi	rp, rp, 32
+	bdnz	L(m2_top)
+
+L(m2_end):
+	ld	r9, 0(up)
+	maddld(	r28, r8, u0, r10)
+	maddhdu(r31, r8, u0, r10)
+	adde	r5, r29, r5
+	std	r5, 0(rp)
+	mulld	r5, r8, u1
+	mulhdu	r10, r8, u1
+	b	L(cj)			C jump to addmul_2 tail
+
+L(outer):
+	addi	up2, up2, 16		C advance two source limbs per pass
+	addi	rp2, rp2, 32		C and four result limbs
+
+	ld	u0, 0(up2)
+	ld	u1, 8(up2)
+
+	cmpdi	cr5, r22, 4		C cr5 is kept for the ble/bgt tests later in the pass
+	srdi	r31, r22, 2
+	addi	r22, r22, -2
+	mtctr	r31			C ctr = r22 >> 2 (r22 as it was before the -2 above)
+
+	ld	r26, 0(rp2)
+	ld	r27, 16(rp2)
+
+	rldicl.	r31, r22, 63, 63	C r22 & 2
+	beq	cr7, L(evn)		C un even
+
+L(odd):	maddld(	r23, u0, u0, r26)	C W	u2^2
+	maddhdu(r12, u0, u0, r26)	C W	u2^2
+	maddld(	r5, u1, u1, r27)	C W	u3^2
+	maddhdu(r10, u1, u1, r27)	C W	u3^2
+	ld	r26, 8(rp2)
+
+	ld	r8, -8(up2)
+	sradi	r8, r8, 63		C CAUTION: clobbers CA
+	and	r8, r8, u0
+	sradi	r11, u0, 63		C CAUTION: clobbers CA
+	and	r11, r11, u1
+
+	LSHU1U
+
+	addc	r23, r23, r8
+
+	ld	r8, 8(up2)
+	ld	r9, 16(up2)
+	maddld(	r28, r8, u0, r26)	C W	u3 x u2
+	maddhdu(r31, r8, u0, r26)	C W	u3 x u2
+	ld	r26, 24(rp2)
+	std	r23, 0(rp2)		C W0
+
+	bne	cr0, L(11)
+L(01):
+	addi	up, up2, 16
+	addi	rp, rp2, 0
+	b	L(lo2)
+L(11):
+	addi	up, up2, 0
+	addi	rp, rp2, -16
+	b	L(lo0)
+
+L(evn):	maddld(	r23, u0, u0, r26)	C W	u2^2
+	maddhdu(r5, u0, u0, r26)	C W	u2^2
+	maddld(	r12, u1, u1, r27)	C W	u3^2
+	maddhdu(r11, u1, u1, r27)	C W	u3^2
+	ld	r27, 8(rp2)
+
+	ld	r9, -8(up2)
+	sradi	r9, r9, 63		C CAUTION: clobbers CA
+	and	r9, r9, u0
+	sradi	r10, u0, 63		C CAUTION: clobbers CA
+	and	r10, r10, u1
+
+	LSHU1U
+
+	addc	r23, r23, r9
+
+	ld	r9, 8(up2)
+	ld	r8, 16(up2)
+	maddld(	r29, r9, u0, r27)	C W	u3 x u2
+	maddhdu(r30, r9, u0, r27)	C W	u3 x u2
+	ld	r27, 24(rp2)
+	std	r23, 0(rp2)		C W0
+
+	beq	cr0, L(10)
+L(00):
+	addi	up, up2, 8
+	addi	rp, rp2, -8
+	b	L(lo1)
+L(10):
+	addi	up, up2, 24
+	addi	rp, rp2, 8
+	ble	cr5, L(end)		C skip the loop when r22 was <= 4
+
+L(top):	ld	r9, 0(up)
+	maddld(	r28, r8, u0, r10)
+	maddhdu(r31, r8, u0, r10)
+	adde	r5, r29, r5
+	ld	r26, 24(rp)
+	std	r5, 0(rp)
+	maddld(	r5, r8, u1, r27)
+	maddhdu(r10, r8, u1, r27)
+	addex(	r12, r12, r30, 0)
+L(lo2):	ld	r8, 8(up)
+	maddld(	r29, r9, u0, r11)
+	maddhdu(r30, r9, u0, r11)
+	adde	r12, r28, r12
+	ld	r27, 32(rp)
+	std	r12, 8(rp)
+	maddld(	r12, r9, u1, r26)
+	maddhdu(r11, r9, u1, r26)
+	addex(	r5, r5, r31, 0)
+L(lo1):	ld	r9, 16(up)
+	maddld(	r28, r8, u0, r10)
+	maddhdu(r31, r8, u0, r10)
+	adde	r5, r29, r5
+	ld	r26, 40(rp)
+	std	r5, 16(rp)
+	maddld(	r5, r8, u1, r27)
+	maddhdu(r10, r8, u1, r27)
+	addex(	r12, r12, r30, 0)
+L(lo0):	ld	r8, 24(up)
+	maddld(	r29, r9, u0, r11)
+	maddhdu(r30, r9, u0, r11)
+	adde	r12, r28, r12
+	ld	r27, 48(rp)
+	std	r12, 24(rp)
+	maddld(	r12, r9, u1, r26)
+	maddhdu(r11, r9, u1, r26)
+	addex(	r5, r5, r31, 0)
+	addi	up, up, 32		C four limbs consumed per iteration
+	addi	rp, rp, 32
+	bdnz	L(top)
+
+L(end):	ld	r9, 0(up)
+	maddld(	r28, r8, u0, r10)
+	maddhdu(r31, r8, u0, r10)
+	adde	r5, r29, r5
+	std	r5, 0(rp)
+	maddld(	r5, r8, u1, r27)
+	maddhdu(r10, r8, u1, r27)
+L(cj):	addex(	r12, r12, r30, 0)	C shared tail: L(m2_end) branches here
+	maddld(	r29, r9, u0, r11)
+	maddhdu(r30, r9, u0, r11)
+	adde	r12, r28, r12
+	std	r12, 8(rp)
+	mulld	r12, r9, u1
+	mulhdu	r11, r9, u1
+	addex(	r5, r5, r31, 0)
+	adde	r5, r29, r5
+	std	r5, 16(rp)
+	addex(	r12, r12, r30, 0)
+	adde	r12, r12, r10
+	std	r12, 24(rp)
+	li	r4, 0
+	addze	r5, r11
+	addex(	r5, r5, r4, 0)
+	std	r5, 32(rp)
+	bgt	cr5, L(outer)		C another pass if r22 was > 4 when cr5 was set
+
+L(corner):
+	ld	u0, 16(up2)
+	ld	u1, 24(up2)
+	ld	r26, 32(rp2)
+	bne	cr7, L(corner_odd)	C un odd
+
+L(corner_evn):
+	ld	r27, 40(rp2)
+	maddld(	r23, u0, u0, r26)	C W	u2^2
+	maddhdu(r5, u0, u0, r26)	C W	u2^2
+	mulld	r12, u1, u1		C W	u3^2
+	mulhdu	r11, u1, u1		C W	u3^2
+
+	ld	r9, 8(up2)
+	sradi	r9, r9, 63		C CAUTION: clobbers CA
+	and	r9, r9, u0
+	sradi	r10, u0, 63		C CAUTION: clobbers CA
+	and	r10, r10, u1
+
+	LSHU1UHF
+
+	addc	r23, r23, r9
+
+	ld	r9, 24(up2)
+	maddld(	r29, r9, u0, r27)	C W	u3 x u2
+	maddhdu(r30, r9, u0, r27)	C W	u3 x u2
+	std	r23, 32(rp2)
+	adde	r5, r29, r5
+	std	r5, 40(rp2)
+	addex(	r12, r12, r30, 0)
+	adde	r12, r12, r10		C W FIXME can this carry out?
+	std	r12, 48(rp2)
+	li	r4, 0
+	addex(	r5, r11, r4, 0)
+	addze	r5, r5
+	std	r5, 56(rp2)
+	b	L(ret)
+
+L(corner_odd):
+	ld	r27, 48(rp2)
+	maddld(	r23, u0, u0, r26)	C W	u2^2
+	maddhdu(r12, u0, u0, r26)	C W	u2^2
+	maddld(	r5, u1, u1, r27)	C W	u3^2
+	maddhdu(r10, u1, u1, r27)	C W	u3^2
+	ld	r26, 40(rp2)
+
+	ld	r8, 8(up2)
+	sradi	r8, r8, 63		C CAUTION: clobbers CA
+	and	r8, r8, u0
+	sradi	r11, u0, 63		C CAUTION: clobbers CA
+	and	r11, r11, u1
+
+	LSHU1UF
+
+	addc	r23, r23, r8
+
+	ld	r8, 24(up2)
+	ld	r9, 32(up2)
+	maddld(	r28, r8, u0, r26)	C W	u3 x u2
+	maddhdu(r31, r8, u0, r26)	C W	u3 x u2
+	std	r23, 32(rp2)
+	maddld(	r29, r9, u0, r11)
+	maddhdu(r30, r9, u0, r11)
+	adde	r12, r28, r12
+	std	r12, 40(rp2)
+	mulld	r12, r9, u1
+	mulhdu	r11, r9, u1
+	addex(	r5, r5, r31, 0)
+	adde	r5, r29, r5
+	std	r5, 48(rp2)
+	addex(	r12, r12, r30, 0)
+	adde	r12, r12, r10
+	std	r12, 56(rp2)
+	mulld	r23, r9, r9		C W	u2^2
+	mulhdu	r12, r9, r9		C W	u2^2
+	adde	r23, r23, r11
+	addze	r12, r12
+	sradi	r4, r8, 63		C CAUTION: clobbers CA
+	and	r4, r4, r9
+	addex(	r23, r23, r4, 0)
+	std	r23, 64(rp2)
+	li	r4, 0
+	addex(	r12, r12, r4, 0)
+	std	r12, 72(rp2)
+
+L(ret):	ld	r22, -80(r1)		C reload r22-r31 saved at L(gt3)
+	ld	r23, -72(r1)
+	ld	r24, -64(r1)
+	ld	r25, -56(r1)
+	ld	r26, -48(r1)
+	ld	r27, -40(r1)
+	ld	r28, -32(r1)
+	ld	r29, -24(r1)
+	ld	r30, -16(r1)
+	ld	r31, -8(r1)
+	blr
+EPILOGUE()
+ASM_END()
-- 
gitore 0.2.2