#include #include #include #include #define N 32 // Use 'aligned' to help the autovectorizer _Float16 a[N] __attribute__((aligned(16))); float b[N] __attribute__((aligned(16))); static inline uint64_t read_cycles() { uint64_t start; asm volatile ("rdcycle %0" : "=r"(start)); return start; } void benchmark() { // 1. Widening: _Float16 -> float uint64_t t0 = read_cycles(); for (int i = 0; i < N; i++) { b[i] = (float)a[i]; } uint64_t t1 = read_cycles(); // 2. Narrowing: float -> _Float16 uint64_t t2 = read_cycles(); for (int i = 0; i < N; i++) { a[i] = (_Float16)b[i]; } uint64_t t3 = read_cycles(); // In a real app, print (t1-t0) and (t3-t2) } int main() { benchmark(); }