scripts/patch_utils.py

"""Patch vLLM's process_weights_after_loading to call model._post_quant_fix()
after all quant methods have set up their attributes."""
import sys

# Line in vLLM's process_weights_after_loading that the hook is inserted before.
_ANCHOR = '    if model_config.quantization == "torchao":'

# Replacement text: the hook followed by the untouched anchor line.
_REPLACEMENT = '''    # Custom: allow models to run post-quant-init fixes
    if hasattr(model, '_post_quant_fix'):
        model._post_quant_fix()

    if model_config.quantization == "torchao":'''


def apply_patch(src: str) -> str:
    """Return *src* with the ``_post_quant_fix`` hook inserted before the anchor.

    Only the first occurrence of the anchor line is rewritten. If the hook
    is already present, *src* is returned unchanged so that running the
    script twice does not insert the hook twice.

    Raises:
        ValueError: if the anchor line does not occur in *src*.
    """
    # Check for the already-patched form first: the replacement itself ends
    # with the anchor, so "anchor in src" stays true after patching and
    # cannot be used to tell the two states apart.
    if _REPLACEMENT in src:
        return src
    if _ANCHOR not in src:
        raise ValueError("patch target not found")
    return src.replace(_ANCHOR, _REPLACEMENT, 1)


def main(argv) -> int:
    """Patch the file named by ``argv[1]`` in place and return an exit status.

    Returns 0 when the file was patched or was already patched, 1 when the
    patch target is missing, and 2 when no path argument was supplied.
    """
    if len(argv) < 2:
        prog = argv[0] if argv else "patch_utils.py"
        print(f"usage: {prog} PATH", file=sys.stderr)
        return 2

    path = argv[1]
    # Explicit encoding so the result does not depend on the build
    # environment's locale.
    with open(path, encoding="utf-8") as f:
        src = f.read()

    try:
        patched = apply_patch(src)
    except ValueError:
        print(f"WARNING: Could not find patch target in {path}", file=sys.stderr)
        return 1

    if patched == src:
        # Hook already present; leave the file untouched.
        print('process_weights_after_loading already patched')
        return 0

    with open(path, 'w', encoding="utf-8") as f:
        f.write(patched)
    print('Patched process_weights_after_loading')
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
Fix Dockerfile: use external patch script instead of inline Python. Docker's parser chokes on multi-line Python in RUN. Moved to scripts/patch_utils.py and COPY + RUN it. 2026-05-18 19:03:57 +00:00			`"""Patch vLLM's process_weights_after_loading to call model._post_quant_fix()`
			`after all quant methods have set up their attributes."""`
			`import sys`

			`path = sys.argv[1]`
			`with open(path) as f:`
			`src = f.read()`

			`old = ' if model_config.quantization == "torchao":'`
			`new = ''' # Custom: allow models to run post-quant-init fixes`
			`if hasattr(model, '_post_quant_fix'):`
			`model._post_quant_fix()`

			`if model_config.quantization == "torchao":'''`

			`if old not in src:`
			`print(f"WARNING: Could not find patch target in {path}")`
			`sys.exit(1)`

			`src = src.replace(old, new, 1)`
			`with open(path, 'w') as f:`
			`f.write(src)`
			`print('Patched process_weights_after_loading')`