changes to benchmarks and script to extract data for plotting

2026-05-07 10:08:48 +02:00 · 2026-05-07 10:08:48 +02:00 · f639be16a0
commit f639be16a0
parent 15959973d2
10 changed files with 153267 additions and 181 deletions
--- a/extract_data.py
+++ b/extract_data.py
@ -0,0 +1,87 @@
 #!/usr/bin/env python3
 import argparse
 import re
 import pandas as pd
 def parse_cycle_deltas(file_path):
    """Extract paired cycle-counter reads from a trace file.

    The trace is scanned for lines containing a ``csrr <reg>, cycle``
    instruction. The hexadecimal value at the end of the line that
    immediately follows such an instruction is taken as the counter value.
    Consecutive values are then paired as (start, end).

    Args:
        file_path: Path to the trace file to parse.

    Returns:
        A DataFrame with the columns ``start_cycle``, ``end_cycle`` and
        ``delta`` (``end_cycle - start_cycle``), one row per pair. The
        columns are present even when the trace contains no cycle reads.

    Raises:
        ValueError: If an odd number of cycle reads was found.
        OSError: If the file cannot be opened.
    """
    columns = ['start_cycle', 'end_cycle', 'delta']
    csrr_pattern = re.compile(r"csrr\s+\w+,\s+cycle")
    value_pattern = re.compile(r"0x[0-9a-fA-F]+$")
    with open(file_path, 'r') as f:
        lines = [raw.strip() for raw in f]

    cycle_values = []
    i = 0
    while i < len(lines):
        if csrr_pattern.search(lines[i]):
            # The counter value is expected on the line right after the csrr.
            has_next = i + 1 < len(lines)
            match = value_pattern.search(lines[i + 1]) if has_next else None
            if match:
                cycle_values.append(int(match.group(), 16))
                # The value line has been consumed; do not rescan it.
                i += 1
            else:
                # Also reached when the csrr is the last line of the file.
                # Line numbers are 1-based so they match what an editor shows.
                print(f"Warning: Found csrr at line {i + 1} but couldn't find value on line {i + 2}")
        i += 1

    if len(cycle_values) % 2 != 0:
        raise ValueError(
            f"Hanging CSRR detected! Found {len(cycle_values)} cycle reads. "
            "Each 'start' must have a corresponding 'end'."
        )

    # Even positions are start reads, odd positions the matching end reads.
    data = [
        {'start_cycle': start_val, 'end_cycle': end_val, 'delta': end_val - start_val}
        for start_val, end_val in zip(cycle_values[0::2], cycle_values[1::2])
    ]
    # Passing the column list keeps the schema stable for an empty trace, so
    # callers can index the columns without hitting a KeyError.
    frame = pd.DataFrame(data, columns=columns)
    if frame.empty:
        frame = frame.astype('int64')
    return frame
 if __name__ == "__main__":
    # Command-line entry point: parse two trace files (with and without the
    # optimization), pair their cycle deltas row by row, print the combined
    # table and store it as "result.parquet" in the current directory.
    parser = argparse.ArgumentParser(description='Extract cycle deltas from trace files and calculate speedup')
    parser.add_argument('with_file', help='Path to the trace file (with optimization)')
    parser.add_argument('without_file', help='Path to the trace file (without optimization - baseline)')
    args = parser.parse_args()
    try:
        df_with = parse_cycle_deltas(args.with_file)
        df_without = parse_cycle_deltas(args.without_file)
        # Rows are paired by position, so both traces must yield the same
        # number of measurements.
        if len(df_with) != len(df_without):
            raise ValueError(
                f"Mismatch in number of deltas: {args.with_file} has {len(df_with)} deltas, "
                f"but {args.without_file} has {len(df_without)} deltas. Cannot pair them."
            )
        result = pd.DataFrame({
            'start_cycle_with': df_with['start_cycle'],
            'end_cycle_with': df_with['end_cycle'],
            'delta_with': df_with['delta'],
            'start_cycle_without': df_without['start_cycle'],
            'end_cycle_without': df_without['end_cycle'],
            'delta_without': df_without['delta'],
            # Baseline cycles divided by optimized cycles.
            'speedup': df_without['delta'] / df_with['delta']
        })
        print("Cycle Delta Analysis:")
        print(result)
        result.to_parquet("result.parquet")
    except (OSError, ValueError) as e:
        # Exit with a non-zero status (message goes to stderr) so shells and
        # build scripts can tell that no result file was produced.
        raise SystemExit(f"Error: {e}")
--- a/traces/with_czero.txt
+++ b/traces/with_czero.txt
--- a/traces/with_zfa.txt
+++ b/traces/with_zfa.txt
--- a/traces/with_zfhmin.txt
+++ b/traces/with_zfhmin.txt
--- a/traces/without_czero.txt
+++ b/traces/without_czero.txt
--- a/traces/without_zfa.txt
+++ b/traces/without_zfa.txt
--- a/traces/without_zfhmin.txt
+++ b/traces/without_zfhmin.txt
--- a/zfa_micro/zfa.c
+++ b/zfa_micro/zfa.c
@ -4,194 +4,204 @@
 #define N 10
 #define ZFA
 // Execute the RISC-V `rdcycle` instruction and return the value it writes
 // to its destination register.
 // NOTE(review): the call sites visible in this file discard the return
 // value; presumably the reads are recovered from the execution trace
 // rather than from the returned number — confirm.
 static inline uint64_t read_cycles() {
  uint64_t start;  // receives the output operand of the asm statement below
  // `volatile` asks the compiler to keep the instruction even though callers
  // may ignore the result.
  asm volatile ("rdcycle %0" : "=r"(start));
  return start;
 }
 // Zfa constant table for Single Precision (fli.s)
 // 32 entries, four per row; the trailing comment on each row gives the
 // index range of that row.
 // NOTE(review): these values do not appear to match the FLI immediate table
 // in the RISC-V Zfa specification (e.g. index 1 repeats -1.0f where the spec
 // lists the minimum positive normal, and the entries from index 4 onward
 // follow a different sequence) — confirm against the spec before relying on
 // this table.
 const float zfa_constants_s[32] = {
  -1.0f,        -1.0f,      0x1p-16f,     0x1p-15f,     // 0 - 3
  0x1p-14f,     0x1p-13f,     0x1p-12f,     0x1p-11f,     // 4 - 7
  0x1p-10f,     0x1p-9f,      0x1p-8f,      0x1p-7f,      // 8 - 11
  0x1p-6f,      0x1p-5f,      0x1p-4f,      0x1p-3f,      // 12 - 15
  0.25f,        0.5f,         0.75f,        1.0f,         // 16 - 19
  1.25f,        1.5f,         1.75f,        2.0f,         // 20 - 23
  2.5f,         3.0f,         4.0f,         8.0f,         // 24 - 27
  16.0f,        32.0f,        INFINITY,     NAN           // 28 - 31
 };
 // Zfa constant table for Double Precision (fli.d)
 // 32 entries, four per row; the trailing comment on each row gives the
 // index range of that row.
 // NOTE(review): index 1 is written with a float literal (-1.0f) in a table
 // of doubles and repeats index 0. The values also do not appear to match the
 // FLI immediate table in the RISC-V Zfa specification (index 1 is the
 // minimum positive normal there, and the sequence differs from index 4
 // onward) — confirm against the spec before relying on this table.
 const double zfa_constants_d[32] = {
  -1.0,         -1.0f,      0x1p-16,      0x1p-15,      // 0 - 3
  0x1p-14,      0x1p-13,      0x1p-12,      0x1p-11,      // 4 - 7
  0x1p-10,      0x1p-9,       0x1p-8,       0x1p-7,       // 8 - 11
  0x1p-6,       0x1p-5,       0x1p-4,       0x1p-3,       // 12 - 15
  0.25,         0.5,          0.75,         1.0,          // 16 - 19
  1.25,         1.5,          1.75,         2.0,          // 20 - 23
  2.5,          3.0,          4.0,          8.0,          // 24 - 27
  16.0,         32.0,         INFINITY,     NAN           // 28 - 31
 };
 int main() {
  // fround.s
-  volatile float a = 3.25f;
+  volatile float a;
-  volatile float b = round(a);
+  volatile float b;
  volatile double c;
  volatile double d;
  // fround.s
  read_cycles();
  for (int i = 0; i < N; i++) {
    a = 3.25f;
    b = round(a);
  }
  read_cycles();
  // fround.d
-  volatile double c = 3.25f;
+  read_cycles();
-  volatile double d = round(c);
+  for (int i = 0; i < N; i++) {
    c = 3.25f;
    d = round(c);
  }
  read_cycles();
  int res;
  // fleq.s
-  #ifndef ZFA
+  read_cycles();
-    asm volatile (
+  for (int i = 0; i < N; i++) {
-                    "fclass.s t0, %1\n\t"     // Classify a
+    #ifndef ZFA
-                    "fclass.s t1, %2\n\t"     // Classify b
+        asm volatile (
-                    "or       t0, t0, t1\n\t" // Combine classes
+                        "fclass.s t0, %1\n\t"     // Classify a
-                    "andi     t2, t0, 0x200\n\t" // 0x200 is the mask for Quiet NaN
+                        "fclass.s t1, %2\n\t"     // Classify b
-                    "bnez     t2, 1f\n\t"     // If qNaN is present, skip to return 0
+                        "or       t0, t0, t1\n\t" // Combine classes
-                    "fle.s    %0, %1, %2\n\t" // Safe to use signaling comparison
+                        "andi     t2, t0, 0x200\n\t" // 0x200 is the mask for Quiet NaN
-                    "j        2f\n\t"
+                        "bnez     t2, 1f\n\t"     // If qNaN is present, skip to return 0
-                    "1:\n\t"
+                        "fle.s    %0, %1, %2\n\t" // Safe to use signaling comparison
-                    "li       %0, 0\n\t"      // Result is false for NaNs
+                        "j        2f\n\t"
-                    "2:\n\t"
+                        "1:\n\t"
-                    : "=r" (res)
+                        "li       %0, 0\n\t"      // Result is false for NaNs
-                    : "f" (a), "f" (b)
+                        "2:\n\t"
-                    : "t0", "t1", "t2"
+                        : "=r" (res)
-                    );
+                        : "f" (a), "f" (b)
                        : "t0", "t1", "t2"
                        );
-  #else
+    #else
-    asm volatile("fleq.s t0, ft0, ft1");
+        asm volatile("fleq.s t0, ft0, ft1");
-  #endif
+    #endif
  }
  read_cycles();
  // fleq.d
-  #ifndef ZFA
+  read_cycles();
-    asm volatile (
+  for (int i = 0; i < N; i++) {
-                    "fclass.d t0, %1\n\t"        // Classify double a
+    #ifndef ZFA
-                    "fclass.d t1, %2\n\t"        // Classify double b
+        asm volatile (
-                    "or       t0, t0, t1\n\t"    // Combine classification masks
+                        "fclass.d t0, %1\n\t"        // Classify double a
-                    "andi     t2, t0, 0x200\n\t" // 0x200 is the bit for Quiet NaN (qNaN)
+                        "fclass.d t1, %2\n\t"        // Classify double b
-                    "bnez     t2, 1f\n\t"        // If a qNaN is detected, skip to return 0
+                        "or       t0, t0, t1\n\t"    // Combine classification masks
-                    "fle.d    %0, %1, %2\n\t"    // Signaling comparison: signals on sNaN, result in %0
+                        "andi     t2, t0, 0x200\n\t" // 0x200 is the bit for Quiet NaN (qNaN)
-                    "j        2f\n\t"
+                        "bnez     t2, 1f\n\t"        // If a qNaN is detected, skip to return 0
-                    "1:\n\t"
+                        "fle.d    %0, %1, %2\n\t"    // Signaling comparison: signals on sNaN, result in %0
-                    "li       %0, 0\n\t"         // Quietly return 0 (false) for qNaNs
+                        "j        2f\n\t"
-                    "2:\n\t"
+                        "1:\n\t"
-                    : "=r" (res)
+                        "li       %0, 0\n\t"         // Quietly return 0 (false) for qNaNs
-                    : "f" (a), "f" (b)
+                        "2:\n\t"
-                    : "t0", "t1", "t2"
+                        : "=r" (res)
-                    );
+                        : "f" (a), "f" (b)
                        : "t0", "t1", "t2"
                        );
-  #else
+    #else
-    asm volatile ("fleq.d t0, ft0, ft1");
+        asm volatile ("fleq.d t0, ft0, ft1");
-  #endif
+    #endif
  }
  read_cycles();
  // fminm.s
  float a_fmin = 0.0f, b_fmin = -0.0f;
  float res_fmin;
-  #ifndef ZFA
+  read_cycles();
-    asm volatile (
+  for (int i = 0; i < N; i++) {
-                    "fclass.s t0, %1\n\t"      // Classify a
+    #ifndef ZFA
-                    "fclass.s t1, %2\n\t"      // Classify b
+        asm volatile (
-                    "li       t2, 0x300\n\t"   // Mask for any NaN (0x100 sNaN | 0x200 qNaN)
+                        "fclass.s t0, %1\n\t"      // Classify a
-                    "and      t3, t0, t2\n\t"  // t3 = is_nan(a)
+                        "fclass.s t1, %2\n\t"      // Classify b
-                    "and      t4, t1, t2\n\t"  // t4 = is_nan(b)
+                        "li       t2, 0x300\n\t"   // Mask for any NaN (0x100 sNaN | 0x200 qNaN)
-                    "bnez     t3, 1f\n\t"      // If a is NaN, jump to handle it
+                        "and      t3, t0, t2\n\t"  // t3 = is_nan(a)
-                    "bnez     t4, 2f\n\t"      // If b is NaN, jump to handle it
+                        "and      t4, t1, t2\n\t"  // t4 = is_nan(b)
-                    "fmin.s   %0, %1, %2\n\t"  // Neither is NaN, use standard min
+                        "bnez     t3, 1f\n\t"      // If a is NaN, jump to handle it
-                    "j        3f\n\t"
+                        "bnez     t4, 2f\n\t"      // If b is NaN, jump to handle it
-                    "1:\n\t"                   // Case: a is NaN
+                        "fmin.s   %0, %1, %2\n\t"  // Neither is NaN, use standard min
-                    "bnez     t4, 4f\n\t"      // If b is also NaN, jump to both-NaN case
+                        "j        3f\n\t"
-                    "fmv.s    %0, %2\n\t"      // a is NaN, b is number -> return b
+                        "1:\n\t"                   // Case: a is NaN
-                    "j        3f\n\t"
+                        "bnez     t4, 4f\n\t"      // If b is also NaN, jump to both-NaN case
-                    "2:\n\t"                   // Case: b is NaN, a is number -> return a
+                        "fmv.s    %0, %2\n\t"      // a is NaN, b is number -> return b
-                    "fmv.s    %0, %1\n\t"
+                        "j        3f\n\t"
-                    "j        3f\n\t"
+                        "2:\n\t"                   // Case: b is NaN, a is number -> return a
-                    "4:\n\t"                   // Case: Both are NaNs
+                        "fmv.s    %0, %1\n\t"
-                    "fmin.s   %0, %1, %2\n\t"  // Standard min handles both-NaNs correctly
+                        "j        3f\n\t"
-                    "3:\n\t"
+                        "4:\n\t"                   // Case: Both are NaNs
-                    : "=f" (res_fmin)
+                        "fmin.s   %0, %1, %2\n\t"  // Standard min handles both-NaNs correctly
-                    : "f" (a_fmin), "f" (b_fmin)
+                        "3:\n\t"
-                    : "t0", "t1", "t2", "t3", "t4"
+                        : "=f" (res_fmin)
-                    );
+                        : "f" (a_fmin), "f" (b_fmin)
-  #else
+                        : "t0", "t1", "t2", "t3", "t4"
-    asm volatile ("fminm.s ft0, ft1, ft2");
+                        );
-  #endif
+    #else
        asm volatile ("fminm.s ft0, ft1, ft2");
    #endif
  }
  read_cycles();
  // fli.s
  read_cycles();
  volatile float res_fli_s[32];
-  res_fli_s[0]  = -1.0f;
+  for (int i = 0; i < N; i++) {
-  res_fli_s[1]  = -1.0f;
+    res_fli_s[0]  = -1.0f;
-  res_fli_s[2]  = 0x1p-16f;
+    res_fli_s[1]  = -1.0f;
-  res_fli_s[3]  = 0x1p-15f;
+    res_fli_s[2]  = 0x1p-16f;
-  res_fli_s[4]  = 0x1p-14f;
+    res_fli_s[3]  = 0x1p-15f;
-  res_fli_s[5]  = 0x1p-13f;
+    res_fli_s[4]  = 0x1p-14f;
-  res_fli_s[6]  = 0x1p-12f;
+    res_fli_s[5]  = 0x1p-13f;
-  res_fli_s[7]  = 0x1p-11f;
+    res_fli_s[6]  = 0x1p-12f;
-  res_fli_s[8]  = 0x1p-10f;
+    res_fli_s[7]  = 0x1p-11f;
-  res_fli_s[9]  = 0x1p-9f;
+    res_fli_s[8]  = 0x1p-10f;
-  res_fli_s[10] = 0x1p-8f;
+    res_fli_s[9]  = 0x1p-9f;
-  res_fli_s[11] = 0x1p-7f;
+    res_fli_s[10] = 0x1p-8f;
-  res_fli_s[12] = 0x1p-6f;
+    res_fli_s[11] = 0x1p-7f;
-  res_fli_s[13] = 0x1p-5f;
+    res_fli_s[12] = 0x1p-6f;
-  res_fli_s[14] = 0x1p-4f;
+    res_fli_s[13] = 0x1p-5f;
-  res_fli_s[15] = 0x1p-3f;
+    res_fli_s[14] = 0x1p-4f;
-  res_fli_s[16] = 0.25f;
+    res_fli_s[15] = 0x1p-3f;
-  res_fli_s[17] = 0.5f;
+    res_fli_s[16] = 0.25f;
-  res_fli_s[18] = 0.75f;
+    res_fli_s[17] = 0.5f;
-  res_fli_s[19] = 1.0f;
+    res_fli_s[18] = 0.75f;
-  res_fli_s[20] = 1.25f;
+    res_fli_s[19] = 1.0f;
-  res_fli_s[21] = 1.5f;
+    res_fli_s[20] = 1.25f;
-  res_fli_s[22] = 1.75f;
+    res_fli_s[21] = 1.5f;
-  res_fli_s[23] = 2.0f;
+    res_fli_s[22] = 1.75f;
-  res_fli_s[24] = 2.5f;
+    res_fli_s[23] = 2.0f;
-  res_fli_s[25] = 3.0f;
+    res_fli_s[24] = 2.5f;
-  res_fli_s[26] = 4.0f;
+    res_fli_s[25] = 3.0f;
-  res_fli_s[27] = 8.0f;
+    res_fli_s[26] = 4.0f;
-  res_fli_s[28] = 16.0f;
+    res_fli_s[27] = 8.0f;
-  res_fli_s[29] = 32.0f;
+    res_fli_s[28] = 16.0f;
-  res_fli_s[30] = INFINITY;
+    res_fli_s[29] = 32.0f;
-  res_fli_s[31] = NAN;
+    res_fli_s[30] = INFINITY;
    res_fli_s[31] = NAN;
  }
  read_cycles();
  // fli.d
  read_cycles();
  volatile double res_fli_d[32];
-  res_fli_s[0]  = -1.0f;
+  for (int i = 0; i < N; i++) {
-  res_fli_s[1]  = -1.0f;
+    res_fli_s[0]  = -1.0f;
-  res_fli_s[2]  = 0x1p-16f;
+    res_fli_s[1]  = -1.0f;
-  res_fli_s[3]  = 0x1p-15f;
+    res_fli_s[2]  = 0x1p-16f;
-  res_fli_s[4]  = 0x1p-14f;
+    res_fli_s[3]  = 0x1p-15f;
-  res_fli_s[5]  = 0x1p-13f;
+    res_fli_s[4]  = 0x1p-14f;
-  res_fli_s[6]  = 0x1p-12f;
+    res_fli_s[5]  = 0x1p-13f;
-  res_fli_s[7]  = 0x1p-11f;
+    res_fli_s[6]  = 0x1p-12f;
-  res_fli_s[8]  = 0x1p-10f;
+    res_fli_s[7]  = 0x1p-11f;
-  res_fli_s[9]  = 0x1p-9f;
+    res_fli_s[8]  = 0x1p-10f;
-  res_fli_s[10] = 0x1p-8f;
+    res_fli_s[9]  = 0x1p-9f;
-  res_fli_s[11] = 0x1p-7f;
+    res_fli_s[10] = 0x1p-8f;
-  res_fli_s[12] = 0x1p-6f;
+    res_fli_s[11] = 0x1p-7f;
-  res_fli_s[13] = 0x1p-5f;
+    res_fli_s[12] = 0x1p-6f;
-  res_fli_s[14] = 0x1p-4f;
+    res_fli_s[13] = 0x1p-5f;
-  res_fli_s[15] = 0x1p-3f;
+    res_fli_s[14] = 0x1p-4f;
-  res_fli_s[16] = 0.25f;
+    res_fli_s[15] = 0x1p-3f;
-  res_fli_s[17] = 0.5f;
+    res_fli_s[16] = 0.25f;
-  res_fli_s[18] = 0.75f;
+    res_fli_s[17] = 0.5f;
-  res_fli_s[19] = 1.0f;
+    res_fli_s[18] = 0.75f;
-  res_fli_s[20] = 1.25f;
+    res_fli_s[19] = 1.0f;
-  res_fli_s[21] = 1.5f;
+    res_fli_s[20] = 1.25f;
-  res_fli_s[22] = 1.75f;
+    res_fli_s[21] = 1.5f;
-  res_fli_s[23] = 2.0f;
+    res_fli_s[22] = 1.75f;
-  res_fli_s[24] = 2.5f;
+    res_fli_s[23] = 2.0f;
-  res_fli_s[25] = 3.0f;
+    res_fli_s[24] = 2.5f;
-  res_fli_s[26] = 4.0f;
+    res_fli_s[25] = 3.0f;
-  res_fli_s[27] = 8.0f;
+    res_fli_s[26] = 4.0f;
-  res_fli_s[28] = 16.0f;
+    res_fli_s[27] = 8.0f;
-  res_fli_s[29] = 32.0f;
+    res_fli_s[28] = 16.0f;
-  res_fli_s[30] = INFINITY;
+    res_fli_s[29] = 32.0f;
-  res_fli_s[31] = NAN;
+    res_fli_s[30] = INFINITY;
    res_fli_s[31] = NAN;
  }
  read_cycles();
  // fcvtmod.w.d
--- a/zfhmin_micro/zfhmin.c
+++ b/zfhmin_micro/zfhmin.c
@ -17,7 +17,9 @@ int main() {
  volatile double e;
  volatile _Float16 g;
-  volatile _Float16 a = 3.0f;
+  volatile _Float16 a = 3.25f;
  // fcvt.s.h
  read_cycles();
  for (int i = 0; i < N; i++) {
@ -27,7 +29,7 @@ int main() {
  // fcvt.h.s
-  volatile float c = 3.0f;
+  volatile float c = 3.25f;
  read_cycles();
  for (int i = 0; i < N; i++) {
@ -39,13 +41,13 @@ int main() {
  read_cycles();
  for (int i = 0; i < N; i++) {
-    double e = (double) a;
+    e = (double) a;
  }
  read_cycles();
  // fcvt.h.d
-  volatile double f = 3.0f;
+  volatile double f = 3.25f;
  read_cycles();
  for (int i = 0; i < N; i++) {
--- a/zicond_micro/zicond.c
+++ b/zicond_micro/zicond.c
@ -1,7 +1,7 @@
 #include <stdint.h>
 #define N 128
-#define ITERATIONS 10
+#define ITERATIONS 1
 // Static "messy" data to ensure the branch predictor cannot "learn" the pattern
 static const uint64_t src_a[N] = {
@ -31,38 +31,27 @@ static inline uint64_t read_cycles() {
 }
 int main() {
-    uint64_t start, end;
+    read_cycles();
    // --- Benchmark 1: Trivial czero.nez ---
    // Pattern: if (a != 0) return b else return 0
    start = read_cycles();
    for (int j = 0; j < ITERATIONS; j++) {
        for (int i = 0; i < N; i++) {
            uint64_t a = src_a[i];
            uint64_t b = src_b[i];
-            // GCC will use czero.eqz here to zero out b if a is 0
+
            results[i] = (a != 0) ? b : 0;
        }
    }
-    end = read_cycles();
+    read_cycles();
    // Record (end - start) for Zicond enabled vs disabled
-    // --- Benchmark 2: Logic AND (czero with complex condition) ---
+    read_cycles();
    // Pattern: if (a != 0 AND b > 500) return b else return 0
    start = read_cycles();
    for (int j = 0; j < ITERATIONS; j++) {
        for (int i = 0; i < N; i++) {
            uint64_t a = src_a[i];
            uint64_t b = src_b[i];
-            // Uses 'and' to combine conditions, then 'czero'
+
-            if (a != 0 && b > 500) {
+            results[i] = (a != 0) ? b : 0;
                results[i] = b;
            } else {
                results[i] = 0;
            }
        }
    }
-    end = read_cycles();
+    read_cycles();
    return 0;
 }