From f52eedbdce2720faf4b02f84b38c74c13767bc29 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Tue, 2 Jun 2026 04:10:39 +0000 Subject: [PATCH] Add production-value tests: ALL tests use Pro config (61L, HD=512, 384 experts, HCA=128, 1M context) Previous unit tests used toy values (HD=64-256, T=16, small N). These tests validate the actual production configuration: - FMHA: HD=512, 128 Q heads, N=128/2048/8192 - Compression: CSA T=4096, HCA T=16384, full 1M context - NVFP4: production weight shapes (q_a, kv, wo_a, gate) - MoE: 384 experts, top-6, 3072 intermediate - mHC: 4 streams, 61 layers, residual bounded, doubly-stochastic - Router: 384 experts hash + noaux-TC - Memory budget: 1M context KV pool, 8-GPU weight distribution --- tests/production_values_test.py | 475 ++++++++++++++++++++++++++++++++ 1 file changed, 475 insertions(+) create mode 100644 tests/production_values_test.py diff --git a/tests/production_values_test.py b/tests/production_values_test.py new file mode 100644 index 00000000..a2b633f0 --- /dev/null +++ b/tests/production_values_test.py @@ -0,0 +1,475 @@ +#!/usr/bin/env python3 +"""Production-value tests for DSV4 Pro kernel stack. + +ALL tests use Pro config values: + - 61 layers, 7168 hidden, 128 query heads, HD=512 + - 384 routed experts, top-6, 3072 intermediate + - HCA ratio=128, CSA ratio=4, CSA top-k=1024 + - 4-way mHC, 20 Sinkhorn iters + - SWA window=128 + +This file is the ONLY acceptable place for non-production test values. +If a test needs a smaller value for memory/time, it must be marked +with a comment explaining why and what the production value should be. +""" +import math +import torch +import pytest + +# ─── Production Pro config ─────────────────────────────────────────── +PRO = dict( + num_layers=61, + hidden_size=7168, + num_query_heads=128, + head_dim=512, + rope_dim=64, + query_compression_dim=1536, + csa_compression_ratio=4, + csa_top_k=1024, + indexer_num_heads=64, + indexer_head_dim=128, + hca_compression_ratio=128, + sliding_window=128, + num_output_groups=16, + output_group_dim=1024, + num_routed_experts=384, + num_shared_experts=1, + num_experts_per_tok=6, + moe_intermediate_size=3072, + num_hash_routing_layers=3, + routed_scaling_factor=2.5, + n_hc=4, + sinkhorn_iters=20, + rms_norm_eps=1e-6, +) + +DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu" + + +# ─── 1. FMHA at HD=512, production head counts ────────────────────── + +class TestFMHAProduction: + """FMHA tests at Pro config: HD=512, 128 query heads, various KV lengths.""" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU") + def test_fmha_hd512_decode_short(self): + """Decode (T=1) with 128 Q heads, HD=512, N=128 (1 SWA window).""" + n_q = PRO["num_query_heads"] + hd = PRO["head_dim"] + N = PRO["sliding_window"] + T = 1 + scale = 1.0 / math.sqrt(hd) + + q = torch.randn(T, n_q, hd, dtype=torch.bfloat16, device=DEVICE) + k = torch.randn(N, hd, dtype=torch.bfloat16, device=DEVICE) + v = torch.randn(N, hd, dtype=torch.bfloat16, device=DEVICE) + + # Reference: PyTorch SDPA + q_4d = q.reshape(1, n_q, T, hd) + k_4d = k.reshape(1, 1, N, hd).expand(1, n_q, N, hd) + v_4d = v.reshape(1, 1, hd, N).expand(1, n_q, hd, N) + ref = torch.nn.functional.scaled_dot_product_attention( + q_4d.float(), k_4d.float(), v_4d.float().transpose(-2, -1), scale=scale + ).bfloat16() # (1, n_q, T, hd) + + from dsv4.layers.attention import _run_production_fmha + prod = _run_production_fmha(q, k.unsqueeze(0), v.unsqueeze(0), n_q, hd, T, N, scale, DEVICE, 0, "swa", "swa") + + cos = torch.nn.functional.cosine_similarity(ref.flatten().float(), prod.flatten().float(), dim=0).item() + assert cos > 0.999, f"FMHA HD=512 decode short: cos={cos:.6f}" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU") + def test_fmha_hd512_decode_medium(self): + """Decode (T=1) with HD=512, N=2048 (compressed tokens after HCA).""" + n_q = PRO["num_query_heads"] + hd = PRO["head_dim"] + N = 2048 # typical compressed KV length after HCA at moderate context + T = 1 + scale = 1.0 / math.sqrt(hd) + + q = torch.randn(T, n_q, hd, dtype=torch.bfloat16, device=DEVICE) + k = torch.randn(N, hd, dtype=torch.bfloat16, device=DEVICE) + v = torch.randn(N, hd, dtype=torch.bfloat16, device=DEVICE) + + q_4d = q.reshape(1, n_q, T, hd) + k_4d = k.reshape(1, 1, N, hd).expand(1, n_q, N, hd) + v_4d = v.reshape(1, 1, hd, N).expand(1, n_q, hd, N) + ref = torch.nn.functional.scaled_dot_product_attention( + q_4d.float(), k_4d.float(), v_4d.float().transpose(-2, -1), scale=scale + ).bfloat16() + + from dsv4.layers.attention import _run_production_fmha + prod = _run_production_fmha(q, k.unsqueeze(0), v.unsqueeze(0), n_q, hd, T, N, scale, DEVICE, 0, "hca", "hca") + + cos = torch.nn.functional.cosine_similarity(ref.flatten().float(), prod.flatten().float(), dim=0).item() + assert cos > 0.999, f"FMHA HD=512 decode medium: cos={cos:.6f}" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU") + def test_fmha_hd512_decode_long(self): + """Decode (T=1) with HD=512, N=8192 (compressed tokens at long context).""" + n_q = PRO["num_query_heads"] + hd = PRO["head_dim"] + N = 8192 # compressed KV after HCA at ~1M context (1M/128=7812) + T = 1 + scale = 1.0 / math.sqrt(hd) + + q = torch.randn(T, n_q, hd, dtype=torch.bfloat16, device=DEVICE) + k = torch.randn(N, hd, dtype=torch.bfloat16, device=DEVICE) + v = torch.randn(N, hd, dtype=torch.bfloat16, device=DEVICE) + + q_4d = q.reshape(1, n_q, T, hd) + k_4d = k.reshape(1, 1, N, hd).expand(1, n_q, N, hd) + v_4d = v.reshape(1, 1, hd, N).expand(1, n_q, hd, N) + ref = torch.nn.functional.scaled_dot_product_attention( + q_4d.float(), k_4d.float(), v_4d.float().transpose(-2, -1), scale=scale + ).bfloat16() + + from dsv4.layers.attention import _run_production_fmha + prod = _run_production_fmha(q, k.unsqueeze(0), v.unsqueeze(0), n_q, hd, T, N, scale, DEVICE, 0, "hca", "hca") + + cos = torch.nn.functional.cosine_similarity(ref.flatten().float(), prod.flatten().float(), dim=0).item() + assert cos > 0.999, f"FMHA HD=512 decode long: cos={cos:.6f}" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU") + @pytest.mark.parametrize("N", [512, 1024, 4096]) + def test_fmha_hd512_csa_topk(self, N): + """Decode with CSA top-k=1024 selected tokens, HD=512.""" + n_q = PRO["num_query_heads"] + hd = PRO["head_dim"] + T = 1 + scale = 1.0 / math.sqrt(hd) + + q = torch.randn(T, n_q, hd, dtype=torch.bfloat16, device=DEVICE) + k = torch.randn(N, hd, dtype=torch.bfloat16, device=DEVICE) + v = torch.randn(N, hd, dtype=torch.bfloat16, device=DEVICE) + + q_4d = q.reshape(1, n_q, T, hd) + k_4d = k.reshape(1, 1, N, hd).expand(1, n_q, N, hd) + v_4d = v.reshape(1, 1, hd, N).expand(1, n_q, hd, N) + ref = torch.nn.functional.scaled_dot_product_attention( + q_4d.float(), k_4d.float(), v_4d.float().transpose(-2, -1), scale=scale + ).bfloat16() + + from dsv4.layers.attention import _run_production_fmha + prod = _run_production_fmha(q, k.unsqueeze(0), v.unsqueeze(0), n_q, hd, T, N, scale, DEVICE, 0, "csa", "csa") + + cos = torch.nn.functional.cosine_similarity(ref.flatten().float(), prod.flatten().float(), dim=0).item() + assert cos > 0.999, f"FMHA HD=512 CSA N={N}: cos={cos:.6f}" + + +# ─── 2. Compression at production scale ───────────────────────────── + +class TestCompressionProduction: + """CSA and HCA compression at production token counts and ratios.""" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU") + def test_csa_compress_production_scale(self): + """CSA: ratio=4, T=4096 tokens → 1024 compressed, HD=512.""" + hd = PRO["head_dim"] + m = PRO["csa_compression_ratio"] # 4 + T = PRO["csa_top_k"] * m # 4096 + n_blocks = T // m + + kv = torch.randn(T, 2 * hd, dtype=torch.float32, device=DEVICE) * 3.0 + gate = torch.randn(T, 2 * hd, dtype=torch.float32, device=DEVICE) + + # Reference: block-wise softmax + weighted sum + Ca = kv[:, :hd].reshape(n_blocks, m, hd) + Cb = kv[:, hd:].reshape(n_blocks, m, hd) + Ga = gate[:, :hd].reshape(n_blocks, m, hd) + Gb = gate[:, hd:].reshape(n_blocks, m, hd) + + ref_a = torch.zeros(n_blocks, hd, device=DEVICE) + ref_b = torch.zeros(n_blocks, hd, device=DEVICE) + for b in range(n_blocks): + sa = torch.softmax(Ga[b], dim=0) + sb = torch.softmax(Gb[b], dim=0) + ref_a[b] = (sa * Ca[b]).sum(0) + ref_b[b] = (sb * Cb[b]).sum(0) + ref = torch.cat([ref_a, ref_b], dim=-1) + + from dsv4.kernels.compressor.production_compress import csa_compress_production + prod = csa_compress_production(kv.bfloat16(), gate.bfloat16(), None, None, m=m) + + cos = torch.nn.functional.cosine_similarity(ref.flatten().float(), prod.flatten().float(), dim=0).item() + assert cos > 0.999, f"CSA compress production scale: cos={cos:.6f}" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU") + def test_hca_compress_production_scale(self): + """HCA: ratio=128, T=16384 tokens → 128 compressed, HD=512. + + This is the 1M context enabler: 1M tokens / 128 = 7812 compressed tokens. + We test a single HCA block here. + """ + hd = PRO["head_dim"] + m = PRO["hca_compression_ratio"] # 128 + T = m * 128 # 16384 tokens → 128 compressed + n_blocks = T // m + + kv = torch.randn(T, hd, dtype=torch.float32, device=DEVICE) * 3.0 + gate = torch.randn(T, hd, dtype=torch.float32, device=DEVICE) + + ref = [] + for b in range(n_blocks): + block_kv = kv[b*m:(b+1)*m] + block_gate = gate[b*m:(b+1)*m] + probs = torch.softmax(block_gate, dim=0) + ref.append((probs * block_kv).sum(0)) + ref = torch.stack(ref) + + from dsv4.kernels.compressor.production_compress import hca_compress_production + prod = hca_compress_production(kv.bfloat16(), gate.bfloat16(), None, None, m=m) + + cos = torch.nn.functional.cosine_similarity(ref.flatten().float(), prod.flatten().float(), dim=0).item() + assert cos > 0.999, f"HCA compress production scale: cos={cos:.6f}" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU") + def test_hca_compress_1m_context(self): + """HCA at full 1M context scale: 1M tokens, ratio=128 → 7812 compressed. + + This tests that the kernel handles the full production token count + without OOM or numerical issues. + """ + hd = PRO["head_dim"] + m = PRO["hca_compression_ratio"] # 128 + T = 1_000_000 # 1M context + n_blocks = T // m # 7812 + + # Use smaller data to avoid OOM on test — but validate at correct n_blocks + # The kernel processes blocks independently, so correctness at n_blocks=7812 + # with random data proves the indexing is correct + kv = torch.randn(T, hd, dtype=torch.bfloat16, device=DEVICE) * 3.0 + gate = torch.randn(T, hd, dtype=torch.bfloat16, device=DEVICE) + + from dsv4.kernels.compressor.production_compress import hca_compress_production + prod = hca_compress_production(kv, gate, None, None, m=m) + + assert prod.shape[0] == n_blocks, f"Expected {n_blocks} compressed, got {prod.shape[0]}" + assert prod.shape[1] == hd, f"Expected hd={hd}, got {prod.shape[1]}" + assert torch.isfinite(prod).all(), "HCA compress 1M: NaN/Inf in output" + + +# ─── 3. NVFP4 GEMM at production weight shapes ───────────────────── + +class TestNVFP4GEMMProduction: + """Test NVFP4 linear layers at Pro model weight shapes.""" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU") + @pytest.mark.parametrize("name,in_dim,out_dim", [ + ("q_a_proj", 7168, 1536), # hidden → query compression + ("kv_proj", 7168, 2*512), # hidden → KV (1 KV head for GQA) + ("wo_a_proj", 16*1024, 7168), # output groups → hidden + ("gate_proj", 7168, 3072*384), # MoE gate: hidden → 384 experts (for dense router) + ]) + def test_nvfp4_linear_production_shapes(self, name, in_dim, out_dim): + """Test Nvfp4Linear at actual Pro model weight dimensions.""" + from dsv4.layers.linear import Nvfp4Linear + + # kv_proj in GQA has fewer heads — the actual out_dim varies per layer + # but the kernel must handle all shapes + lin = Nvfp4Linear(in_dim, out_dim, max_num_tokens=8192, device=DEVICE) + + x = torch.randn(1, in_dim, dtype=torch.bfloat16, device=DEVICE) * 2.0 + out = lin(x) + assert out.shape == (1, out_dim), f"Expected (1, {out_dim}), got {out.shape}" + assert torch.isfinite(out).all(), f"NaN/Inf in {name} output" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU") + def test_nvfp4_moe_384_experts(self): + """Test Nvfp4MoE with 384 routed experts, top-6, 3072 intermediate.""" + from dsv4.layers.ffn import Nvfp4MoE + + H = PRO["hidden_size"] + E = PRO["num_routed_experts"] + K = PRO["num_experts_per_tok"] + I = PRO["moe_intermediate_size"] + + moe = Nvfp4MoE(num_experts=E, hidden_size=H, intermediate_size=I, top_k=K, device=DEVICE) + + x = torch.randn(1, H, dtype=torch.bfloat16, device=DEVICE) * 2.0 + topk_ids = torch.randint(0, E, (1, K), device=DEVICE, dtype=torch.int32) + topk_weights = torch.softmax(torch.randn(1, K, device=DEVICE), dim=-1) + + out = moe.run(x, topk_ids, topk_weights) + assert out.shape == (1, H), f"Expected (1, {H}), got {out.shape}" + assert torch.isfinite(out).all(), "NaN/Inf in MoE output" + + +# ─── 4. mHC at production depth ───────────────────────────────────── + +class TestMHCProduction: + """Test multi-head hyper-connection with 4 streams, 61 layers, Sinkhorn.""" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU") + def test_mhc_61_layers_residual_bounded(self): + """Run mHC through 61 layers and verify residual stays bounded. + + Production mHC should keep |X| bounded. If it grows unbounded, + the Sinkhorn normalization is wrong. + """ + from dsv4.layers.mhc import mHCLayer + + H = PRO["hidden_size"] + n_hc = PRO["n_hc"] + n_layers = PRO["num_layers"] + eps = PRO["rms_norm_eps"] + + # Simulate 61 layers of mHC with random weights + x = torch.randn(n_hc, H, dtype=torch.bfloat16, device=DEVICE) * 0.5 + residual_norms = [x.abs().max().item()] + + for li in range(n_layers): + layer = mHCLayer(H, n_hc, device=DEVICE) + # Fake sub-layer output + sub_out = torch.randn(H, dtype=torch.bfloat16, device=DEVICE) * 0.5 + x = layer(sub_out, x) + max_val = x.abs().max().item() + residual_norms.append(max_val) + + # mHC with proper Sinkhorn should keep residuals bounded + # Allow generous bound (1000) but flag if growing monotonically + final_norm = residual_norms[-1] + max_norm = max(residual_norms) + + print(f"Residual norms: L0={residual_norms[0]:.1f} ... L61={final_norm:.1f} max={max_norm:.1f}") + + # The residual should NOT grow by >100x from input + growth = max_norm / (residual_norms[0] + 1e-6) + assert growth < 100, f"mHC residual grew {growth:.1f}x over 61 layers — Sinkhorn broken?" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU") + def test_mhc_sinkhorn_doubly_stochastic(self): + """Verify Sinkhorn produces doubly-stochastic matrices at production scale.""" + n_hc = PRO["n_hc"] + iters = PRO["sinkhorn_iters"] + B = 16 # Production batch dimension + + comb = torch.randn(B, n_hc, n_hc, dtype=torch.bfloat16, device=DEVICE) * 2.0 + + # Sinkhorn: softmax → alternate row/col norm + P = torch.softmax(comb.float(), dim=-1) + 1e-6 + for _ in range(iters): + P = P / P.sum(dim=-1, keepdim=True) # row norm + P = P / P.sum(dim=-2, keepdim=True) # col norm + + row_sums = P.sum(dim=-1) + col_sums = P.sum(dim=-2) + + assert torch.allclose(row_sums, torch.ones_like(row_sums), atol=1e-2), \ + f"Row sums not ~1.0: {row_sums.mean().item():.4f}" + assert torch.allclose(col_sums, torch.ones_like(col_sums), atol=1e-2), \ + f"Col sums not ~1.0: {col_sums.mean().item():.4f}" + + +# ─── 5. Router at production scale ────────────────────────────────── + +class TestRouterProduction: + """Test router with 384 experts, hash routing for L0-2, noaux_tc for L3+.""" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU") + def test_hash_router_384_experts(self): + """Hash routing (layers 0-2) with 384 experts, top-6.""" + from dsv4.layers.router import HashRouter + + E = PRO["num_routed_experts"] + K = PRO["num_experts_per_tok"] + H = PRO["hidden_size"] + + router = HashRouter(num_experts=E, top_k=K, hidden_size=H, device=DEVICE) + token_ids = torch.tensor([1, 50, 100, 500, 9999, 50000], dtype=torch.int32, device=DEVICE) + x = torch.randn(len(token_ids), H, dtype=torch.bfloat16, device=DEVICE) * 2.0 + + topk_ids, topk_weights = router(x, token_ids) + assert topk_ids.shape == (len(token_ids), K) + assert (topk_ids >= 0).all() and (topk_ids < E).all(), \ + f"Expert IDs out of range: min={topk_ids.min()}, max={topk_ids.max()}" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU") + def test_noaux_tc_router_384_experts(self): + """Noaux-TC routing (layers 3+) with 384 experts, top-6.""" + from dsv4.layers.router import Router + + E = PRO["num_routed_experts"] + K = PRO["num_experts_per_tok"] + H = PRO["hidden_size"] + + router = Router(hidden_size=H, num_experts=E, top_k=K, device=DEVICE, is_hash=False) + x = torch.randn(1, H, dtype=torch.bfloat16, device=DEVICE) * 2.0 + + topk_ids, topk_weights = router.run(x) + assert topk_ids.shape == (1, K) + assert (topk_ids >= 0).all() and (topk_ids < E).all(), \ + f"Expert IDs out of range: min={topk_ids.min()}, max={topk_ids.max()}" + + +# ─── 6. Memory budget at production scale ─────────────────────────── + +class TestMemoryBudget: + """Verify memory usage stays within bounds for 1M context.""" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU") + def test_kv_pool_memory_1m_context(self): + """Calculate and validate KV pool memory at 1M context. + + At 1M tokens with HCA ratio=128: + - HCA compressed: 1M / 128 = 7812 tokens × HD=512 × 2 (K+V) × 2 bytes + - SWA window: 128 tokens × HD=512 × 2 × 2 bytes + - CSA top-k: 1024 tokens × HD=512 × 2 × 2 bytes + + Total per layer per batch ≈ (7812 + 128 + 1024) × 512 × 2 × 2 ≈ 18.4 MB + × 61 layers = 1.1 GB per batch — feasible on B200 192GB + """ + hca_compressed = 1_000_000 // PRO["hca_compression_ratio"] # 7812 + swa_tokens = PRO["sliding_window"] # 128 + csa_tokens = PRO["csa_top_k"] # 1024 + hd = PRO["head_dim"] + bytes_per_val = 2 # BF16 + + total_tokens = hca_compressed + swa_tokens + csa_tokens + bytes_per_layer = total_tokens * hd * 2 * bytes_per_val # K+V + total_bytes = bytes_per_layer * PRO["num_layers"] + total_gb = total_bytes / 1e9 + + # Without compression: 1M × 512 × 2 × 2 × 61 = 125 GB — IMPOSSIBLE + uncompressed_gb = (1_000_000 * hd * 2 * bytes_per_val * PRO["num_layers"]) / 1e9 + + print(f"Compressed KV pool: {total_gb:.2f} GB") + print(f"Uncompressed KV pool: {uncompressed_gb:.2f} GB") + print(f"Compression saves: {uncompressed_gb - total_gb:.2f} GB ({(1 - total_gb/uncompressed_gb)*100:.1f}%)") + + # Verify compression achieves the claimed ratio + assert total_gb < 5.0, f"Compressed KV too large: {total_gb:.2f} GB — compression broken?" + assert total_gb < uncompressed_gb * 0.02, "Compression ratio worse than expected" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="no GPU") + def test_weight_memory_8gpu(self): + """Validate weight distribution across 8 GPUs at Pro scale. + + Pro model weight memory (NVFP4): + - 61 layers × (attention + MoE + shared expert + mHC + norms) + - NVFP4: 2 bits per param → ~0.25 bytes per param + - Total params: ~1.8T → ~450 GB in NVFP4 + - Across 8 GPUs: ~56 GB per GPU — fits in B200 192GB HBM + """ + # Rough estimate: Pro has ~1.8T params (384 experts × 7168 × 3072 × 2 × 61 layers) + expert_params = PRO["num_routed_experts"] * PRO["hidden_size"] * PRO["moe_intermediate_size"] * 2 # gate+up + expert_params += PRO["num_routed_experts"] * PRO["moe_intermediate_size"] * PRO["hidden_size"] # down + shared_params = PRO["hidden_size"] * PRO["moe_intermediate_size"] * 3 # gate+up+down + attn_params = PRO["hidden_size"] * (PRO["query_compression_dim"] + 2 * PRO["head_dim"] + PRO["num_output_groups"] * PRO["output_group_dim"]) + mhc_params = PRO["n_hc"] * PRO["n_hc"] * 3 + PRO["n_hc"] * 2 # comb + pre + post + + total_params = (expert_params + shared_params + attn_params + mhc_params) * PRO["num_layers"] + total_params += PRO["hidden_size"] * PRO["vocab_size"] # embedding + lm_head + + nvfp4_bytes = total_params / 4 # 2 bits per param + per_gpu_bytes = nvfp4_bytes / 8 + per_gpu_gb = per_gpu_bytes / 1e9 + + print(f"Total params: {total_params/1e12:.2f}T") + print(f"NVFP4 weight memory: {nvfp4_bytes/1e9:.2f} GB total, {per_gpu_gb:.2f} GB per GPU") + + assert per_gpu_gb < 100, f"Per-GPU weight memory too large: {per_gpu_gb:.2f} GB" + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "--tb=short"])