#!/usr/bin/env python3 """ Compute expected O for linear pattern P_ij = i*128 + j. Use same random seed as test. """ import torch import math torch.manual_seed(42) hd = 256 n_kv = 128 scale_softmax = 1.0 / math.sqrt(hd) # Generate random Q,K,V as in test q = torch.randn(128, hd, dtype=torch.bfloat16, device='cuda') k = torch.randn(128, hd, dtype=torch.bfloat16, device='cuda') v = torch.randn(128, hd, dtype=torch.bfloat16, device='cuda') # Compute reference P via softmax scores = (q @ k.mT) * scale_softmax p_ref = torch.softmax(scores, dim=-1) # 128x128 # Linear pattern P p_linear = torch.zeros(128, 128, dtype=torch.bfloat16, device='cuda') for i in range(128): for j in range(128): p_linear[i,j] = i*128 + j # Compute O for both patterns o_ref = p_ref @ v o_linear = p_linear @ v print("Reference O shape:", o_ref.shape) print("Linear O shape:", o_linear.shape) print("\nFirst row, first 4 cols:") print("O_ref[0,:4] =", o_ref[0,:4,0].tolist()) print("O_lin[0,:4] =", o_linear[0,:4,0].tolist()) # Compute expected scaling if mapping correct # Kernel output for hd=256: out[0,:4]=[0.029296875, 0.0164794921875, -0.029541015625, 0.02294921875] kernel_out = [0.029296875, 0.0164794921875, -0.029541015625, 0.02294921875] print("\nKernel out[0,:4] =", kernel_out) # Compare with linear pattern print("\nDifference kernel vs linear:") for i in range(4): diff = kernel_out[i] - o_linear[0,i,0].item() print(f"col {i}: kernel {kernel_out[i]:.6f} vs linear {o_linear[0,i,0].item():.6f} diff={diff:.6f}") # Compute cosine similarity kernel_tensor = torch.tensor(kernel_out, dtype=torch.float32) linear_tensor = o_linear[0,:4,0].float() cos = torch.cosine_similarity(kernel_tensor, linear_tensor, dim=0).item() print(f"\nCosine similarity (first 4 cols): {cos:.6f}") # Also compute expected O for P=1.0 pattern p_one = torch.ones(128,128, dtype=torch.bfloat16, device='cuda') o_one = p_one @ v print("\nP=1.0 pattern O[0,:4] =", o_one[0,:4,0].tolist())