diff --git a/zfa_micro/zfa.c b/zfa_micro/zfa.c
new file mode 100644
index 0000000..d513c04
--- /dev/null
+++ b/zfa_micro/zfa.c
@@ -0,0 +1,197 @@
+#include <math.h>
+#include <stdint.h>
+
+#define N 10
+
+// Cycle count of the fli.s/fli.d section; volatile so the measurement is kept
+// even though nothing prints it.
+volatile uint64_t fli_cycles;
+
+// Read the cycle counter. The "memory" clobber keeps the compiler from moving
+// the measured loads/stores across the counter read.
+static inline uint64_t read_cycles(void) {
+    uint64_t cycles;
+    asm volatile ("rdcycle %0" : "=r"(cycles) : : "memory");
+    return cycles;
+}
+
+// Zfa constant table for Single Precision (fli.s), indexed by the rs1 field.
+// Entry 1 is the minimum positive normal value.
+const float zfa_constants_s[32] = {
+    -1.0f,    0x1p-126f, 0x1p-16f, 0x1p-15f,  //  0 -  3
+    0x1p-8f,  0x1p-7f,   0.0625f,  0.125f,    //  4 -  7
+    0.25f,    0.3125f,   0.375f,   0.4375f,   //  8 - 11
+    0.5f,     0.625f,    0.75f,    0.875f,    // 12 - 15
+    1.0f,     1.25f,     1.5f,     1.75f,     // 16 - 19
+    2.0f,     2.5f,      3.0f,     4.0f,      // 20 - 23
+    8.0f,     16.0f,     128.0f,   256.0f,    // 24 - 27
+    32768.0f, 65536.0f,  INFINITY, NAN        // 28 - 31
+};
+
+// Zfa constant table for Double Precision (fli.d), indexed by the rs1 field.
+// Entry 1 is the minimum positive normal value.
+const double zfa_constants_d[32] = {
+    -1.0,    0x1p-1022, 0x1p-16,  0x1p-15,    //  0 -  3
+    0x1p-8,  0x1p-7,    0.0625,   0.125,      //  4 -  7
+    0.25,    0.3125,    0.375,    0.4375,     //  8 - 11
+    0.5,     0.625,     0.75,     0.875,      // 12 - 15
+    1.0,     1.25,      1.5,      1.75,       // 16 - 19
+    2.0,     2.5,       3.0,      4.0,        // 20 - 23
+    8.0,     16.0,      128.0,    256.0,      // 24 - 27
+    32768.0, 65536.0,   INFINITY, NAN         // 28 - 31
+};
+
+int main(void) {
+    // fround.s: roundf() keeps the rounding in single precision
+    volatile float a = 3.25f;
+    volatile float b = roundf(a);
+
+    // fround.d
+    volatile double c = 3.25;
+    volatile double d = round(c);
+    int res;
+
+    // fleq.s: quiet <=, a quiet NaN operand gives 0 without executing fle.s
+#ifndef ZFA
+    asm volatile (
+        "fclass.s t0, %1\n\t"     // Classify a
+        "fclass.s t1, %2\n\t"     // Classify b
+        "or t0, t0, t1\n\t"       // Combine classes
+        "andi t2, t0, 0x200\n\t"  // 0x200 is the mask for Quiet NaN
+        "bnez t2, 1f\n\t"         // If qNaN is present, skip to return 0
+        "fle.s %0, %1, %2\n\t"    // Safe to use signaling comparison
+        "j 2f\n\t"
+        "1:\n\t"
+        "li %0, 0\n\t"            // Result is false for NaNs
+        "2:\n\t"
+        : "=r" (res)
+        : "f" (a), "f" (b)
+        : "t0", "t1", "t2"
+    );
+#else
+    asm volatile ("fleq.s %0, %1, %2" : "=r" (res) : "f" (a), "f" (b));
+#endif
+
+    // fleq.d: same sequence on the double operands c and d
+#ifndef ZFA
+    asm volatile (
+        "fclass.d t0, %1\n\t"     // Classify double c
+        "fclass.d t1, %2\n\t"     // Classify double d
+        "or t0, t0, t1\n\t"       // Combine classification masks
+        "andi t2, t0, 0x200\n\t"  // 0x200 is the bit for Quiet NaN (qNaN)
+        "bnez t2, 1f\n\t"         // If a qNaN is detected, skip to return 0
+        "fle.d %0, %1, %2\n\t"    // Signaling comparison, result in %0
+        "j 2f\n\t"
+        "1:\n\t"
+        "li %0, 0\n\t"            // Quietly return 0 (false) for qNaNs
+        "2:\n\t"
+        : "=r" (res)
+        : "f" (c), "f" (d)
+        : "t0", "t1", "t2"
+    );
+#else
+    asm volatile ("fleq.d %0, %1, %2" : "=r" (res) : "f" (c), "f" (d));
+#endif
+
+    // fminm.s: like fmin.s, but any NaN operand yields the canonical NaN
+    float a_fmin = 0.0f, b_fmin = -0.0f;
+    float res_fmin;
+
+#ifndef ZFA
+    asm volatile (
+        "fclass.s t0, %1\n\t"     // Classify a
+        "fclass.s t1, %2\n\t"     // Classify b
+        "or t0, t0, t1\n\t"       // Combine classes
+        "andi t0, t0, 0x300\n\t"  // Any NaN (0x100 sNaN | 0x200 qNaN)
+        "bnez t0, 1f\n\t"         // A NaN operand takes the NaN path
+        "fmin.s %0, %1, %2\n\t"   // Neither is NaN, use standard min
+        "j 2f\n\t"
+        "1:\n\t"
+        "fadd.s %0, %1, %2\n\t"   // NaN + x produces the canonical NaN
+        "2:\n\t"
+        : "=f" (res_fmin)
+        : "f" (a_fmin), "f" (b_fmin)
+        : "t0", "t1"
+    );
+#else
+    asm volatile ("fminm.s %0, %1, %2"
+                  : "=f" (res_fmin)
+                  : "f" (a_fmin), "f" (b_fmin));
+#endif
+
+    // fli.s: each store uses one of the 32 constants from the fli.s table
+    uint64_t fli_start = read_cycles();
+    volatile float res_fli_s[32];
+    res_fli_s[0] = -1.0f;
+    res_fli_s[1] = 0x1p-126f;
+    res_fli_s[2] = 0x1p-16f;
+    res_fli_s[3] = 0x1p-15f;
+    res_fli_s[4] = 0x1p-8f;
+    res_fli_s[5] = 0x1p-7f;
+    res_fli_s[6] = 0.0625f;
+    res_fli_s[7] = 0.125f;
+    res_fli_s[8] = 0.25f;
+    res_fli_s[9] = 0.3125f;
+    res_fli_s[10] = 0.375f;
+    res_fli_s[11] = 0.4375f;
+    res_fli_s[12] = 0.5f;
+    res_fli_s[13] = 0.625f;
+    res_fli_s[14] = 0.75f;
+    res_fli_s[15] = 0.875f;
+    res_fli_s[16] = 1.0f;
+    res_fli_s[17] = 1.25f;
+    res_fli_s[18] = 1.5f;
+    res_fli_s[19] = 1.75f;
+    res_fli_s[20] = 2.0f;
+    res_fli_s[21] = 2.5f;
+    res_fli_s[22] = 3.0f;
+    res_fli_s[23] = 4.0f;
+    res_fli_s[24] = 8.0f;
+    res_fli_s[25] = 16.0f;
+    res_fli_s[26] = 128.0f;
+    res_fli_s[27] = 256.0f;
+    res_fli_s[28] = 32768.0f;
+    res_fli_s[29] = 65536.0f;
+    res_fli_s[30] = INFINITY;
+    res_fli_s[31] = NAN;
+
+    // fli.d: same constants in double precision, stored to res_fli_d
+    volatile double res_fli_d[32];
+    res_fli_d[0] = -1.0;
+    res_fli_d[1] = 0x1p-1022;
+    res_fli_d[2] = 0x1p-16;
+    res_fli_d[3] = 0x1p-15;
+    res_fli_d[4] = 0x1p-8;
+    res_fli_d[5] = 0x1p-7;
+    res_fli_d[6] = 0.0625;
+    res_fli_d[7] = 0.125;
+    res_fli_d[8] = 0.25;
+    res_fli_d[9] = 0.3125;
+    res_fli_d[10] = 0.375;
+    res_fli_d[11] = 0.4375;
+    res_fli_d[12] = 0.5;
+    res_fli_d[13] = 0.625;
+    res_fli_d[14] = 0.75;
+    res_fli_d[15] = 0.875;
+    res_fli_d[16] = 1.0;
+    res_fli_d[17] = 1.25;
+    res_fli_d[18] = 1.5;
+    res_fli_d[19] = 1.75;
+    res_fli_d[20] = 2.0;
+    res_fli_d[21] = 2.5;
+    res_fli_d[22] = 3.0;
+    res_fli_d[23] = 4.0;
+    res_fli_d[24] = 8.0;
+    res_fli_d[25] = 16.0;
+    res_fli_d[26] = 128.0;
+    res_fli_d[27] = 256.0;
+    res_fli_d[28] = 32768.0;
+    res_fli_d[29] = 65536.0;
+    res_fli_d[30] = INFINITY;
+    res_fli_d[31] = NAN;
+    fli_cycles = read_cycles() - fli_start;
+
+    // fcvtmod.w.d: not exercised yet
+
+    return 0;
+}
diff --git a/zicond_micro/zicond.c b/zicond_micro/zicond.c
new file mode 100644
index 0000000..b73e24d
--- /dev/null
+++ b/zicond_micro/zicond.c
@@ -0,0 +1,76 @@
+#include <stdint.h>
+
+#define N 128
+#define ITERATIONS 10
+
+// Static "messy" data to ensure the branch predictor cannot "learn" the pattern
+static const uint64_t src_a[N] = {
+    0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,
+    1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,
+    0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,
+    1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1
+};
+
+static const uint64_t src_b[N] = {
+    10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160,
+    170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300, 310, 320,
+    330, 340, 350, 360, 370, 380, 390, 400, 410, 420, 430, 440, 450, 460, 470, 480,
+    490, 500, 510, 520, 530, 540, 550, 560, 570, 580, 590, 600, 610, 620, 630, 640,
+    650, 660, 670, 680, 690, 700, 710, 720, 730, 740, 750, 760, 770, 780, 790, 800,
+    810, 820, 830, 840, 850, 860, 870, 880, 890, 900, 910, 920, 930, 940, 950, 960,
+    970, 980, 990, 1000, 1010, 1020, 1030, 1040, 1050, 1060, 1070, 1080, 1090, 1100, 1110, 1120,
+    1130, 1140, 1150, 1160, 1170, 1180, 1190, 1200, 1210, 1220, 1230, 1240, 1250, 1260, 1270, 1280
+};
+
+volatile uint64_t results[N];
+
+// Cycle counts of the two benchmarks; volatile so they are kept even though
+// nothing prints them. Compare them for Zicond enabled vs disabled.
+volatile uint64_t cycles_trivial;
+volatile uint64_t cycles_and;
+
+// Read the cycle counter. The "memory" clobber keeps the compiler from moving
+// the measured loads/stores across the counter read.
+static inline uint64_t read_cycles(void) {
+    uint64_t val;
+    asm volatile ("rdcycle %0" : "=r"(val) : : "memory");
+    return val;
+}
+
+int main(void) {
+    uint64_t start, end;
+
+    // --- Benchmark 1: Trivial czero.eqz ---
+    // Pattern: if (a != 0) return b else return 0
+    start = read_cycles();
+    for (int j = 0; j < ITERATIONS; j++) {
+        for (int i = 0; i < N; i++) {
+            uint64_t a = src_a[i];
+            uint64_t b = src_b[i];
+            // GCC will use czero.eqz here to zero out b if a is 0
+            results[i] = (a != 0) ? b : 0;
+        }
+    }
+    end = read_cycles();
+    cycles_trivial = end - start;
+
+    // --- Benchmark 2: Logic AND (czero with complex condition) ---
+    // Pattern: if (a != 0 AND b > 500) return b else return 0
+    start = read_cycles();
+    for (int j = 0; j < ITERATIONS; j++) {
+        for (int i = 0; i < N; i++) {
+            uint64_t a = src_a[i];
+            uint64_t b = src_b[i];
+            // Uses 'and' to combine conditions, then 'czero'
+            if (a != 0 && b > 500) {
+                results[i] = b;
+            } else {
+                results[i] = 0;
+            }
+        }
+    }
+    end = read_cycles();
+    cycles_and = end - start;
+
+    return 0;
+}
diff --git a/zvfhmin_micro/zvfhmin.c b/zvfhmin_micro/zvfhmin.c
new file mode 100644
index 0000000..b27543c
--- /dev/null
+++ b/zvfhmin_micro/zvfhmin.c
@@ -0,0 +1,43 @@
+#include <stdint.h>
+
+#define N 32
+
+// Use 'aligned' to help the autovectorizer
+_Float16 a[N] __attribute__((aligned(16)));
+float b[N] __attribute__((aligned(16)));
+
+// Cycle counts of the two conversion loops; volatile so they are kept even
+// though nothing prints them.
+volatile uint64_t widen_cycles;
+volatile uint64_t narrow_cycles;
+
+// Read the cycle counter. The "memory" clobber keeps the compiler from moving
+// the measured loads/stores across the counter read.
+static inline uint64_t read_cycles(void) {
+    uint64_t cycles;
+    asm volatile ("rdcycle %0" : "=r"(cycles) : : "memory");
+    return cycles;
+}
+
+void benchmark(void) {
+    // 1. Widening: _Float16 -> float
+    uint64_t t0 = read_cycles();
+    for (int i = 0; i < N; i++) {
+        b[i] = (float)a[i];
+    }
+    uint64_t t1 = read_cycles();
+    widen_cycles = t1 - t0;
+
+    // 2. Narrowing: float -> _Float16
+    uint64_t t2 = read_cycles();
+    for (int i = 0; i < N; i++) {
+        a[i] = (_Float16)b[i];
+    }
+    uint64_t t3 = read_cycles();
+    narrow_cycles = t3 - t2;
+}
+
+int main(void) {
+    benchmark();
+    return 0;
+}