changes to benchmarks and script to extract data for plotting

This commit is contained in:
Marc Marcos 2026-05-07 10:08:48 +02:00
parent 15959973d2
commit f639be16a0
10 changed files with 153267 additions and 181 deletions

87
extract_data.py Normal file
View file

@ -0,0 +1,87 @@
#!/usr/bin/env python3
import argparse
import re
import pandas as pd
def parse_cycle_deltas(file_path):
    """Extract paired cycle-counter reads from a trace file.

    The trace is scanned for lines matching ``csrr <reg>, cycle``. The line
    immediately after such a match is expected to end in a hexadecimal
    literal (``0x...``), which is taken as the value that was read. The
    collected values are then grouped two by two, in file order, as the
    (start, end) reads around one measured region.

    Args:
        file_path: Path of the text trace file to read.

    Returns:
        A ``pandas.DataFrame`` with one row per measured region and the
        columns ``start_cycle``, ``end_cycle`` and ``delta``
        (``end_cycle - start_cycle``). The three columns are always present,
        even when the trace contains no cycle reads at all, so callers can
        index them unconditionally.

    Raises:
        ValueError: If an odd number of cycle values was collected, i.e. a
            start read has no matching end read.
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    csrr_pattern = re.compile(r"csrr\s+\w+,\s+cycle")
    value_pattern = re.compile(r"0x[0-9a-fA-F]+$")

    with open(file_path, 'r') as f:
        lines = [raw.strip() for raw in f]

    cycle_values = []
    i = 0
    while i < len(lines):
        if csrr_pattern.search(lines[i]):
            # The value is expected on the following line; a csrr on the very
            # last line has no such line and is reported like any other miss.
            has_next = i + 1 < len(lines)
            match = value_pattern.search(lines[i + 1]) if has_next else None
            if match:
                cycle_values.append(int(match.group(), 16))
                # Skip the value line that was just consumed.
                i += 1
            else:
                # Report 1-based line numbers, as shown by editors.
                print(f"Warning: Found csrr at line {i + 1} but couldn't find value on line {i + 2}")
        i += 1

    if len(cycle_values) % 2 != 0:
        raise ValueError(
            f"Hanging CSRR detected! Found {len(cycle_values)} cycle reads. "
            "Each 'start' must have a corresponding 'end'."
        )

    # Even positions are start reads, odd positions the matching end reads.
    rows = [
        {'start_cycle': start_val, 'end_cycle': end_val, 'delta': end_val - start_val}
        for start_val, end_val in zip(cycle_values[0::2], cycle_values[1::2])
    ]
    # Passing the column names explicitly keeps the schema stable when
    # ``rows`` is empty (a bare ``pd.DataFrame([])`` would have no columns).
    return pd.DataFrame(rows, columns=['start_cycle', 'end_cycle', 'delta'])
def main():
    """Compare two traces region by region and store the resulting table.

    Both trace files are parsed with ``parse_cycle_deltas``; the regions are
    paired by position and the speedup of each pair is computed as
    ``delta_without / delta_with``. The combined table is printed and written
    as a Parquet file.

    On a missing input file or inconsistent trace contents the script
    terminates with an ``Error: ...`` message on stderr and a non-zero exit
    status, so that calling scripts can detect the failure.
    """
    parser = argparse.ArgumentParser(description='Extract cycle deltas from trace files and calculate speedup')
    parser.add_argument('with_file', help='Path to the trace file (with optimization)')
    parser.add_argument('without_file', help='Path to the trace file (without optimization - baseline)')
    parser.add_argument(
        '-o', '--output',
        default='result.parquet',
        help='Path of the Parquet file to write (default: result.parquet)',
    )
    args = parser.parse_args()

    try:
        df_with = parse_cycle_deltas(args.with_file)
        df_without = parse_cycle_deltas(args.without_file)

        # Regions are paired purely by position, so both traces must contain
        # the same number of them.
        if len(df_with) != len(df_without):
            raise ValueError(
                f"Mismatch in number of deltas: {args.with_file} has {len(df_with)} deltas, "
                f"but {args.without_file} has {len(df_without)} deltas. Cannot pair them."
            )

        result = pd.DataFrame({
            'start_cycle_with': df_with['start_cycle'],
            'end_cycle_with': df_with['end_cycle'],
            'delta_with': df_with['delta'],
            'start_cycle_without': df_without['start_cycle'],
            'end_cycle_without': df_without['end_cycle'],
            'delta_without': df_without['delta'],
            'speedup': df_without['delta'] / df_with['delta'],
        })

        print("Cycle Delta Analysis:")
        print(result)
        result.to_parquet(args.output)
    except (FileNotFoundError, ValueError) as e:
        # SystemExit with a string prints it to stderr and exits with status 1.
        raise SystemExit(f"Error: {e}")


if __name__ == "__main__":
    main()

22237
traces/with_czero.txt Normal file

File diff suppressed because it is too large Load diff

22277
traces/with_zfa.txt Normal file

File diff suppressed because it is too large Load diff

20339
traces/with_zfhmin.txt Normal file

File diff suppressed because it is too large Load diff

41965
traces/without_czero.txt Normal file

File diff suppressed because it is too large Load diff

23123
traces/without_zfa.txt Normal file

File diff suppressed because it is too large Load diff

23057
traces/without_zfhmin.txt Normal file

File diff suppressed because it is too large Load diff

View file

@ -4,194 +4,204 @@
#define N 10 #define N 10
#define ZFA
static inline uint64_t read_cycles() { static inline uint64_t read_cycles() {
uint64_t start; uint64_t start;
asm volatile ("rdcycle %0" : "=r"(start)); asm volatile ("rdcycle %0" : "=r"(start));
return start; return start;
} }
// Zfa constant table for Single Precision (fli.s)
// Holds 32 float constants; the trailing comment on each row gives the
// index range (0 - 31) covered by that row.
// NOTE(review): entries 0 and 1 are both -1.0f. The RISC-V Zfa fli.s
// immediate table is understood to place the minimum positive normal value
// at index 1, and several other rows here also appear to differ from that
// table -- confirm against the Zfa specification before treating the index
// as an fli.s immediate encoding.
// NOTE(review): no use of this table is visible in this hunk -- confirm it
// is referenced elsewhere in the file.
const float zfa_constants_s[32] = {
-1.0f, -1.0f, 0x1p-16f, 0x1p-15f, // 0 - 3
0x1p-14f, 0x1p-13f, 0x1p-12f, 0x1p-11f, // 4 - 7
0x1p-10f, 0x1p-9f, 0x1p-8f, 0x1p-7f, // 8 - 11
0x1p-6f, 0x1p-5f, 0x1p-4f, 0x1p-3f, // 12 - 15
0.25f, 0.5f, 0.75f, 1.0f, // 16 - 19
1.25f, 1.5f, 1.75f, 2.0f, // 20 - 23
2.5f, 3.0f, 4.0f, 8.0f, // 24 - 27
16.0f, 32.0f, INFINITY, NAN // 28 - 31
};
// Zfa constant table for Double Precision (fli.d)
// Holds 32 double constants, row for row the same values as the
// single-precision table; the trailing comment on each row gives the index
// range (0 - 31) covered by that row.
// NOTE: the second entry is written as a float literal (-1.0f) unlike the
// other entries; -1.0 is exactly representable in both formats, so the
// stored double is the same as for -1.0.
// NOTE(review): entries 0 and 1 are both -1.0. The RISC-V Zfa fli.d
// immediate table is understood to place the minimum positive normal value
// at index 1 -- confirm against the Zfa specification before treating the
// index as an fli.d immediate encoding.
const double zfa_constants_d[32] = {
-1.0, -1.0f, 0x1p-16, 0x1p-15, // 0 - 3
0x1p-14, 0x1p-13, 0x1p-12, 0x1p-11, // 4 - 7
0x1p-10, 0x1p-9, 0x1p-8, 0x1p-7, // 8 - 11
0x1p-6, 0x1p-5, 0x1p-4, 0x1p-3, // 12 - 15
0.25, 0.5, 0.75, 1.0, // 16 - 19
1.25, 1.5, 1.75, 2.0, // 20 - 23
2.5, 3.0, 4.0, 8.0, // 24 - 27
16.0, 32.0, INFINITY, NAN // 28 - 31
};
int main() { int main() {
// fround.s // fround.s
volatile float a = 3.25f; volatile float a;
volatile float b = round(a); volatile float b;
volatile double c;
volatile double d;
// fround.s
read_cycles();
for (int i = 0; i < N; i++) {
a = 3.25f;
b = round(a);
}
read_cycles();
// fround.d // fround.d
volatile double c = 3.25f; read_cycles();
volatile double d = round(c); for (int i = 0; i < N; i++) {
c = 3.25f;
d = round(c);
}
read_cycles();
int res; int res;
// fleq.s // fleq.s
#ifndef ZFA read_cycles();
asm volatile ( for (int i = 0; i < N; i++) {
"fclass.s t0, %1\n\t" // Classify a #ifndef ZFA
"fclass.s t1, %2\n\t" // Classify b asm volatile (
"or t0, t0, t1\n\t" // Combine classes "fclass.s t0, %1\n\t" // Classify a
"andi t2, t0, 0x200\n\t" // 0x200 is the mask for Quiet NaN "fclass.s t1, %2\n\t" // Classify b
"bnez t2, 1f\n\t" // If qNaN is present, skip to return 0 "or t0, t0, t1\n\t" // Combine classes
"fle.s %0, %1, %2\n\t" // Safe to use signaling comparison "andi t2, t0, 0x200\n\t" // 0x200 is the mask for Quiet NaN
"j 2f\n\t" "bnez t2, 1f\n\t" // If qNaN is present, skip to return 0
"1:\n\t" "fle.s %0, %1, %2\n\t" // Safe to use signaling comparison
"li %0, 0\n\t" // Result is false for NaNs "j 2f\n\t"
"2:\n\t" "1:\n\t"
: "=r" (res) "li %0, 0\n\t" // Result is false for NaNs
: "f" (a), "f" (b) "2:\n\t"
: "t0", "t1", "t2" : "=r" (res)
); : "f" (a), "f" (b)
: "t0", "t1", "t2"
);
#else #else
asm volatile("fleq.s t0, ft0, ft1"); asm volatile("fleq.s t0, ft0, ft1");
#endif #endif
}
read_cycles();
// fleq.d // fleq.d
#ifndef ZFA read_cycles();
asm volatile ( for (int i = 0; i < N; i++) {
"fclass.d t0, %1\n\t" // Classify double a #ifndef ZFA
"fclass.d t1, %2\n\t" // Classify double b asm volatile (
"or t0, t0, t1\n\t" // Combine classification masks "fclass.d t0, %1\n\t" // Classify double a
"andi t2, t0, 0x200\n\t" // 0x200 is the bit for Quiet NaN (qNaN) "fclass.d t1, %2\n\t" // Classify double b
"bnez t2, 1f\n\t" // If a qNaN is detected, skip to return 0 "or t0, t0, t1\n\t" // Combine classification masks
"fle.d %0, %1, %2\n\t" // Signaling comparison: signals on sNaN, result in %0 "andi t2, t0, 0x200\n\t" // 0x200 is the bit for Quiet NaN (qNaN)
"j 2f\n\t" "bnez t2, 1f\n\t" // If a qNaN is detected, skip to return 0
"1:\n\t" "fle.d %0, %1, %2\n\t" // Signaling comparison: signals on sNaN, result in %0
"li %0, 0\n\t" // Quietly return 0 (false) for qNaNs "j 2f\n\t"
"2:\n\t" "1:\n\t"
: "=r" (res) "li %0, 0\n\t" // Quietly return 0 (false) for qNaNs
: "f" (a), "f" (b) "2:\n\t"
: "t0", "t1", "t2" : "=r" (res)
); : "f" (a), "f" (b)
: "t0", "t1", "t2"
);
#else #else
asm volatile ("fleq.d t0, ft0, ft1"); asm volatile ("fleq.d t0, ft0, ft1");
#endif #endif
}
read_cycles();
// fminm.s // fminm.s
float a_fmin = 0.0f, b_fmin = -0.0f; float a_fmin = 0.0f, b_fmin = -0.0f;
float res_fmin; float res_fmin;
#ifndef ZFA read_cycles();
asm volatile ( for (int i = 0; i < N; i++) {
"fclass.s t0, %1\n\t" // Classify a #ifndef ZFA
"fclass.s t1, %2\n\t" // Classify b asm volatile (
"li t2, 0x300\n\t" // Mask for any NaN (0x100 sNaN | 0x200 qNaN) "fclass.s t0, %1\n\t" // Classify a
"and t3, t0, t2\n\t" // t3 = is_nan(a) "fclass.s t1, %2\n\t" // Classify b
"and t4, t1, t2\n\t" // t4 = is_nan(b) "li t2, 0x300\n\t" // Mask for any NaN (0x100 sNaN | 0x200 qNaN)
"bnez t3, 1f\n\t" // If a is NaN, jump to handle it "and t3, t0, t2\n\t" // t3 = is_nan(a)
"bnez t4, 2f\n\t" // If b is NaN, jump to handle it "and t4, t1, t2\n\t" // t4 = is_nan(b)
"fmin.s %0, %1, %2\n\t" // Neither is NaN, use standard min "bnez t3, 1f\n\t" // If a is NaN, jump to handle it
"j 3f\n\t" "bnez t4, 2f\n\t" // If b is NaN, jump to handle it
"1:\n\t" // Case: a is NaN "fmin.s %0, %1, %2\n\t" // Neither is NaN, use standard min
"bnez t4, 4f\n\t" // If b is also NaN, jump to both-NaN case "j 3f\n\t"
"fmv.s %0, %2\n\t" // a is NaN, b is number -> return b "1:\n\t" // Case: a is NaN
"j 3f\n\t" "bnez t4, 4f\n\t" // If b is also NaN, jump to both-NaN case
"2:\n\t" // Case: b is NaN, a is number -> return a "fmv.s %0, %2\n\t" // a is NaN, b is number -> return b
"fmv.s %0, %1\n\t" "j 3f\n\t"
"j 3f\n\t" "2:\n\t" // Case: b is NaN, a is number -> return a
"4:\n\t" // Case: Both are NaNs "fmv.s %0, %1\n\t"
"fmin.s %0, %1, %2\n\t" // Standard min handles both-NaNs correctly "j 3f\n\t"
"3:\n\t" "4:\n\t" // Case: Both are NaNs
: "=f" (res_fmin) "fmin.s %0, %1, %2\n\t" // Standard min handles both-NaNs correctly
: "f" (a_fmin), "f" (b_fmin) "3:\n\t"
: "t0", "t1", "t2", "t3", "t4" : "=f" (res_fmin)
); : "f" (a_fmin), "f" (b_fmin)
#else : "t0", "t1", "t2", "t3", "t4"
asm volatile ("fminm.s ft0, ft1, ft2"); );
#endif #else
asm volatile ("fminm.s ft0, ft1, ft2");
#endif
}
read_cycles();
// fli.s // fli.s
read_cycles(); read_cycles();
volatile float res_fli_s[32]; volatile float res_fli_s[32];
res_fli_s[0] = -1.0f; for (int i = 0; i < N; i++) {
res_fli_s[1] = -1.0f; res_fli_s[0] = -1.0f;
res_fli_s[2] = 0x1p-16f; res_fli_s[1] = -1.0f;
res_fli_s[3] = 0x1p-15f; res_fli_s[2] = 0x1p-16f;
res_fli_s[4] = 0x1p-14f; res_fli_s[3] = 0x1p-15f;
res_fli_s[5] = 0x1p-13f; res_fli_s[4] = 0x1p-14f;
res_fli_s[6] = 0x1p-12f; res_fli_s[5] = 0x1p-13f;
res_fli_s[7] = 0x1p-11f; res_fli_s[6] = 0x1p-12f;
res_fli_s[8] = 0x1p-10f; res_fli_s[7] = 0x1p-11f;
res_fli_s[9] = 0x1p-9f; res_fli_s[8] = 0x1p-10f;
res_fli_s[10] = 0x1p-8f; res_fli_s[9] = 0x1p-9f;
res_fli_s[11] = 0x1p-7f; res_fli_s[10] = 0x1p-8f;
res_fli_s[12] = 0x1p-6f; res_fli_s[11] = 0x1p-7f;
res_fli_s[13] = 0x1p-5f; res_fli_s[12] = 0x1p-6f;
res_fli_s[14] = 0x1p-4f; res_fli_s[13] = 0x1p-5f;
res_fli_s[15] = 0x1p-3f; res_fli_s[14] = 0x1p-4f;
res_fli_s[16] = 0.25f; res_fli_s[15] = 0x1p-3f;
res_fli_s[17] = 0.5f; res_fli_s[16] = 0.25f;
res_fli_s[18] = 0.75f; res_fli_s[17] = 0.5f;
res_fli_s[19] = 1.0f; res_fli_s[18] = 0.75f;
res_fli_s[20] = 1.25f; res_fli_s[19] = 1.0f;
res_fli_s[21] = 1.5f; res_fli_s[20] = 1.25f;
res_fli_s[22] = 1.75f; res_fli_s[21] = 1.5f;
res_fli_s[23] = 2.0f; res_fli_s[22] = 1.75f;
res_fli_s[24] = 2.5f; res_fli_s[23] = 2.0f;
res_fli_s[25] = 3.0f; res_fli_s[24] = 2.5f;
res_fli_s[26] = 4.0f; res_fli_s[25] = 3.0f;
res_fli_s[27] = 8.0f; res_fli_s[26] = 4.0f;
res_fli_s[28] = 16.0f; res_fli_s[27] = 8.0f;
res_fli_s[29] = 32.0f; res_fli_s[28] = 16.0f;
res_fli_s[30] = INFINITY; res_fli_s[29] = 32.0f;
res_fli_s[31] = NAN; res_fli_s[30] = INFINITY;
res_fli_s[31] = NAN;
}
read_cycles();
// fli.d // fli.d
read_cycles();
volatile double res_fli_d[32]; volatile double res_fli_d[32];
res_fli_s[0] = -1.0f; for (int i = 0; i < N; i++) {
res_fli_s[1] = -1.0f; res_fli_s[0] = -1.0f;
res_fli_s[2] = 0x1p-16f; res_fli_s[1] = -1.0f;
res_fli_s[3] = 0x1p-15f; res_fli_s[2] = 0x1p-16f;
res_fli_s[4] = 0x1p-14f; res_fli_s[3] = 0x1p-15f;
res_fli_s[5] = 0x1p-13f; res_fli_s[4] = 0x1p-14f;
res_fli_s[6] = 0x1p-12f; res_fli_s[5] = 0x1p-13f;
res_fli_s[7] = 0x1p-11f; res_fli_s[6] = 0x1p-12f;
res_fli_s[8] = 0x1p-10f; res_fli_s[7] = 0x1p-11f;
res_fli_s[9] = 0x1p-9f; res_fli_s[8] = 0x1p-10f;
res_fli_s[10] = 0x1p-8f; res_fli_s[9] = 0x1p-9f;
res_fli_s[11] = 0x1p-7f; res_fli_s[10] = 0x1p-8f;
res_fli_s[12] = 0x1p-6f; res_fli_s[11] = 0x1p-7f;
res_fli_s[13] = 0x1p-5f; res_fli_s[12] = 0x1p-6f;
res_fli_s[14] = 0x1p-4f; res_fli_s[13] = 0x1p-5f;
res_fli_s[15] = 0x1p-3f; res_fli_s[14] = 0x1p-4f;
res_fli_s[16] = 0.25f; res_fli_s[15] = 0x1p-3f;
res_fli_s[17] = 0.5f; res_fli_s[16] = 0.25f;
res_fli_s[18] = 0.75f; res_fli_s[17] = 0.5f;
res_fli_s[19] = 1.0f; res_fli_s[18] = 0.75f;
res_fli_s[20] = 1.25f; res_fli_s[19] = 1.0f;
res_fli_s[21] = 1.5f; res_fli_s[20] = 1.25f;
res_fli_s[22] = 1.75f; res_fli_s[21] = 1.5f;
res_fli_s[23] = 2.0f; res_fli_s[22] = 1.75f;
res_fli_s[24] = 2.5f; res_fli_s[23] = 2.0f;
res_fli_s[25] = 3.0f; res_fli_s[24] = 2.5f;
res_fli_s[26] = 4.0f; res_fli_s[25] = 3.0f;
res_fli_s[27] = 8.0f; res_fli_s[26] = 4.0f;
res_fli_s[28] = 16.0f; res_fli_s[27] = 8.0f;
res_fli_s[29] = 32.0f; res_fli_s[28] = 16.0f;
res_fli_s[30] = INFINITY; res_fli_s[29] = 32.0f;
res_fli_s[31] = NAN; res_fli_s[30] = INFINITY;
res_fli_s[31] = NAN;
}
read_cycles(); read_cycles();
// fcvtmod.w.d // fcvtmod.w.d

View file

@ -17,7 +17,9 @@ int main() {
volatile double e; volatile double e;
volatile _Float16 g; volatile _Float16 g;
volatile _Float16 a = 3.0f; volatile _Float16 a = 3.25f;
// fcvt.s.h
read_cycles(); read_cycles();
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
@ -27,7 +29,7 @@ int main() {
// fcvt.h.s // fcvt.h.s
volatile float c = 3.0f; volatile float c = 3.25f;
read_cycles(); read_cycles();
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
@ -39,13 +41,13 @@ int main() {
read_cycles(); read_cycles();
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
double e = (double) a; e = (double) a;
} }
read_cycles(); read_cycles();
// fcvt.h.d // fcvt.h.d
volatile double f = 3.0f; volatile double f = 3.25f;
read_cycles(); read_cycles();
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {

View file

@ -1,7 +1,7 @@
#include <stdint.h> #include <stdint.h>
#define N 128 #define N 128
#define ITERATIONS 10 #define ITERATIONS 1
// Static "messy" data to ensure the branch predictor cannot "learn" the pattern // Static "messy" data to ensure the branch predictor cannot "learn" the pattern
static const uint64_t src_a[N] = { static const uint64_t src_a[N] = {
@ -31,38 +31,27 @@ static inline uint64_t read_cycles() {
} }
int main() { int main() {
uint64_t start, end; read_cycles();
// --- Benchmark 1: Trivial czero.nez ---
// Pattern: if (a != 0) return b else return 0
start = read_cycles();
for (int j = 0; j < ITERATIONS; j++) { for (int j = 0; j < ITERATIONS; j++) {
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
uint64_t a = src_a[i]; uint64_t a = src_a[i];
uint64_t b = src_b[i]; uint64_t b = src_b[i];
// GCC will use czero.eqz here to zero out b if a is 0
results[i] = (a != 0) ? b : 0; results[i] = (a != 0) ? b : 0;
} }
} }
end = read_cycles(); read_cycles();
// Record (end - start) for Zicond enabled vs disabled
// --- Benchmark 2: Logic AND (czero with complex condition) --- read_cycles();
// Pattern: if (a != 0 AND b > 500) return b else return 0
start = read_cycles();
for (int j = 0; j < ITERATIONS; j++) { for (int j = 0; j < ITERATIONS; j++) {
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
uint64_t a = src_a[i]; uint64_t a = src_a[i];
uint64_t b = src_b[i]; uint64_t b = src_b[i];
// Uses 'and' to combine conditions, then 'czero'
if (a != 0 && b > 500) { results[i] = (a != 0) ? b : 0;
results[i] = b;
} else {
results[i] = 0;
}
} }
} }
end = read_cycles(); read_cycles();
return 0; return 0;
} }