# vllm/compilation/backends.py

import operator

import torch
import torch.fx as fx


def fix_functionalization(graph: fx.Graph):
    """
    Rewrite the graph module to replace the pattern involving
    torch._higher_order_ops.auto_functionalize.auto_functionalized
    with a direct call to the inplace custom op.

    # TODO: check if PyTorch nightly has fixed this issue
    """

    # debug code, if we want to see the graph before the transformation
    # with open("before.py", "w") as f:
    #     print(graph.python_code(root_module="self", verbose=True).src, file=f)

    nodes_to_remove = []

    for node in graph.nodes:
        # Identify the auto_functionalized node
        if node.op == 'call_function' and node.target == torch._higher_order_ops.auto_functionalize.auto_functionalized:  # noqa
            if node.args[0] == torch.ops._C.rotary_embedding.default:
                # manual replace for rotary_embedding

                # Now, collect the arguments
                kwargs = node.kwargs

                query = kwargs['query']
                mm_node = query.args[0].args[0]

                # Create a new call to torch.ops._C.rotary_embedding.default
                with graph.inserting_before(node):
                    # just insert the call to the custom op
                    # NOTE: don't run dead code elimination,
                    # otherwise this op will be removed
                    graph.call_function(torch.ops._C.rotary_embedding.default,
                                        kwargs=kwargs)

                # Remove the auto_functionalized node
                # Since the node may have outputs, we need to handle its users
                # Replace uses of the outputs (getitem nodes) with mm_node
                for user in list(node.users):
                    if user.op == 'call_function' and user.target == operator.getitem:  # noqa
                        # Remove the getitem node
                        for getitem_user in list(user.users):
                            if (getitem_user.op == 'call_function'
                                    and getitem_user.target
                                    == torch.ops.aten.slice_scatter.default):
                                # Replace the uses of slice_scatter node
                                # with mm_node
                                getitem_user.replace_all_uses_with(mm_node)
                                nodes_to_remove.append(getitem_user)
                        nodes_to_remove.append(user)
                nodes_to_remove.append(node)

            elif node.args[0] == torch.ops._C.fused_add_rms_norm.default:
                # manual replace for fused_add_rms_norm
                # this is the most effective optimization for llama
                # failing to do this will result in many unnecessary copies

                kwargs = node.kwargs

                input = kwargs['input']
                residual = kwargs['residual']

                # Create a new call to torch.ops._C.rotary_embedding.default
                with graph.inserting_before(node):
                    # just insert the call to the custom op
                    # NOTE: don't run dead code elimination,
                    # otherwise this op will be removed
                    graph.call_function(
                        torch.ops._C.fused_add_rms_norm.default, kwargs=kwargs)

                for user in list(node.users):
                    if user.op == 'call_function' and user.target == operator.getitem:  # noqa
                        # Remove the getitem node
                        if user.args[1] == 1:
                            replace_node = input
                        elif user.args[1] == 2:
                            replace_node = residual
                        user.replace_all_uses_with(replace_node)
                        nodes_to_remove.append(user)
                nodes_to_remove.append(node)

            elif node.args[0] == torch.ops._C.rms_norm.default:
                # manual replace for rms_norm

                kwargs = node.kwargs

                input = kwargs['input']
                out = kwargs['out']
                weight = kwargs['weight']
                epsilon = kwargs['epsilon']
                # Create a new call to torch.ops._C.rotary_embedding.default
                # cannot use kwargs, because we have an `out`, see https://github.com/pytorch/pytorch/blob/a00faf440888ffb724bad413f329a49e2b6388e7/torch/_inductor/lowering.py#L351 # noqa
                with graph.inserting_before(node):
                    # just insert the call to the custom op
                    # NOTE: don't run dead code elimination,
                    # otherwise this op will be removed
                    graph.call_function(
                        torch.ops._C.rms_norm.default,
                        args=(out, input, weight, epsilon),
                    )

                replace_node = out

                for user in list(node.users):
                    if user.op == 'call_function' and user.target == operator.getitem:  # noqa
                        user.replace_all_uses_with(replace_node)
                        nodes_to_remove.append(user)
                nodes_to_remove.append(node)

            elif node.args[0] == torch.ops._C.silu_and_mul.default:
                # manual replace for silu_and_mul

                kwargs = node.kwargs

                input = kwargs['input']
                out = kwargs['out']

                # Create a new call to torch.ops._C.rotary_embedding.default
                # cannot use kwargs, because we have an `out`, see https://github.com/pytorch/pytorch/blob/a00faf440888ffb724bad413f329a49e2b6388e7/torch/_inductor/lowering.py#L351 # noqa
                with graph.inserting_before(node):
                    # just insert the call to the custom op
                    # NOTE: don't run dead code elimination,
                    # otherwise this op will be removed
                    graph.call_function(
                        torch.ops._C.silu_and_mul.default,
                        args=(out, input),
                    )
                replace_node = out

                for user in list(node.users):
                    if user.op == 'call_function' and user.target == operator.getitem:  # noqa
                        user.replace_all_uses_with(replace_node)
                        nodes_to_remove.append(user)
                nodes_to_remove.append(node)

    # Remove the nodes all at once
    for node in nodes_to_remove:
        graph.erase_node(node)

    # debug code, if we want to see the graph after the transformation
    # with open("after.py", "w") as f:
    #     print(graph.python_code(root_module="self", verbose=True).src, file=f)


def vllm_backend(graph, example_inputs):
    """
    Compile `graph` with Inductor's `compile_fx`, with
    `fix_functionalization` installed as the
    `post_grad_custom_post_pass` config entry.

    A shallow copy of the current Inductor config is taken and passed
    as `config_patches`, so the module-level config object itself is
    not assigned to here.

    Returns whatever `compile_fx` returns for the given graph and
    example inputs.
    """
    from torch._inductor import config

    # Snapshot the active config and add our pass on top of it.
    patches = {
        **config.shallow_copy_dict(),
        'post_grad_custom_post_pass': fix_functionalization,
    }

    from torch._inductor.compile_fx import compile_fx

    return compile_fx(graph, example_inputs, config_patches=patches)
[torch.compile] fix functionalization (#8480) 2024-09-14 09:46:04 -07:00			`import operator`

			`import torch`
			`import torch.fx as fx`


			`def fix_functionalization(graph: fx.Graph):`
			`"""`
			`Rewrite the graph module to replace the pattern involving`
			`torch._higher_order_ops.auto_functionalize.auto_functionalized`
			`with a direct call to the inplace custom op.`

			`# TODO: check if PyTorch nightly has fixed this issue`
			`"""`

			`# debug code, if we want to see the graph before the transformation`
			`# with open("before.py", "w") as f:`
			`# print(graph.python_code(root_module="self", verbose=True).src, file=f)`

			`nodes_to_remove = []`

			`for node in graph.nodes:`
			`# Identify the auto_functionalized node`
			`if node.op == 'call_function' and node.target == torch._higher_order_ops.auto_functionalize.auto_functionalized: # noqa`
			`if node.args[0] == torch.ops._C.rotary_embedding.default:`
			`# manual replace for rotary_embedding`

			`# Now, collect the arguments`
			`kwargs = node.kwargs`

			`query = kwargs['query']`
			`mm_node = query.args[0].args[0]`

			`# Create a new call to torch.ops._C.rotary_embedding.default`
			`with graph.inserting_before(node):`
			`# just insert the call to the custom op`
			`# NOTE: don't run dead code elimination,`
			`# otherwise this op will be removed`
			`graph.call_function(torch.ops._C.rotary_embedding.default,`
			`kwargs=kwargs)`

			`# Remove the auto_functionalized node`
			`# Since the node may have outputs, we need to handle its users`
			`# Replace uses of the outputs (getitem nodes) with mm_node`
			`for user in list(node.users):`
			`if user.op == 'call_function' and user.target == operator.getitem: # noqa`
			`# Remove the getitem node`
			`for getitem_user in list(user.users):`
			`if (getitem_user.op == 'call_function'`
			`and getitem_user.target`
			`== torch.ops.aten.slice_scatter.default):`
			`# Replace the uses of slice_scatter node`
			`# with mm_node`
			`getitem_user.replace_all_uses_with(mm_node)`
			`nodes_to_remove.append(getitem_user)`
			`nodes_to_remove.append(user)`
			`nodes_to_remove.append(node)`

			`elif node.args[0] == torch.ops._C.fused_add_rms_norm.default:`
			`# manual replace for fused_add_rms_norm`
			`# this is the most effective optimization for llama`
			`# failing to do this will result in many unnecessary copies`

			`kwargs = node.kwargs`

			`input = kwargs['input']`
			`residual = kwargs['residual']`

			`# Create a new call to torch.ops._C.rotary_embedding.default`
			`with graph.inserting_before(node):`
			`# just insert the call to the custom op`
			`# NOTE: don't run dead code elimination,`
			`# otherwise this op will be removed`
			`graph.call_function(`
			`torch.ops._C.fused_add_rms_norm.default, kwargs=kwargs)`

			`for user in list(node.users):`
			`if user.op == 'call_function' and user.target == operator.getitem: # noqa`
			`# Remove the getitem node`
			`if user.args[1] == 1:`
			`replace_node = input`
			`elif user.args[1] == 2:`
			`replace_node = residual`
			`user.replace_all_uses_with(replace_node)`
			`nodes_to_remove.append(user)`
			`nodes_to_remove.append(node)`

			`elif node.args[0] == torch.ops._C.rms_norm.default:`
			`# manual replace for rms_norm`

			`kwargs = node.kwargs`

			`input = kwargs['input']`
			`out = kwargs['out']`
			`weight = kwargs['weight']`
			`epsilon = kwargs['epsilon']`
			`# Create a new call to torch.ops._C.rotary_embedding.default`
			# cannot use kwargs, because we have an `out`, see https://github.com/pytorch/pytorch/blob/a00faf440888ffb724bad413f329a49e2b6388e7/torch/_inductor/lowering.py#L351 # noqa
			`with graph.inserting_before(node):`
			`# just insert the call to the custom op`
			`# NOTE: don't run dead code elimination,`
			`# otherwise this op will be removed`
			`graph.call_function(`
			`torch.ops._C.rms_norm.default,`
			`args=(out, input, weight, epsilon),`
			`)`

			`replace_node = out`

			`for user in list(node.users):`
			`if user.op == 'call_function' and user.target == operator.getitem: # noqa`
			`user.replace_all_uses_with(replace_node)`
			`nodes_to_remove.append(user)`
			`nodes_to_remove.append(node)`

			`elif node.args[0] == torch.ops._C.silu_and_mul.default:`
			`# manual replace for silu_and_mul`

			`kwargs = node.kwargs`

			`input = kwargs['input']`
			`out = kwargs['out']`

			`# Create a new call to torch.ops._C.rotary_embedding.default`
			# cannot use kwargs, because we have an `out`, see https://github.com/pytorch/pytorch/blob/a00faf440888ffb724bad413f329a49e2b6388e7/torch/_inductor/lowering.py#L351 # noqa
			`with graph.inserting_before(node):`
			`# just insert the call to the custom op`
			`# NOTE: don't run dead code elimination,`
			`# otherwise this op will be removed`
			`graph.call_function(`
			`torch.ops._C.silu_and_mul.default,`
			`args=(out, input),`
			`)`
			`replace_node = out`

			`for user in list(node.users):`
			`if user.op == 'call_function' and user.target == operator.getitem: # noqa`
			`user.replace_all_uses_with(replace_node)`
			`nodes_to_remove.append(user)`
			`nodes_to_remove.append(node)`

			`# Remove the nodes all at once`
			`for node in nodes_to_remove:`
			`graph.erase_node(node)`

			`# debug code, if we want to see the graph after the transformation`
			`# with open("after.py", "w") as f:`
			`# print(graph.python_code(root_module="self", verbose=True).src, file=f)`


			`def vllm_backend(graph, example_inputs):`
			`from torch._inductor import config`
			`current_config = config.shallow_copy_dict()`
			`from torch._inductor.compile_fx import compile_fx`
			`current_config['post_grad_custom_post_pass'] = fix_functionalization`
			`return compile_fx(graph, example_inputs, config_patches=current_config)`