TP/quantization/weight loading refactor part 1 - Simplify parallel linear logic (#1181)

2023-10-02 15:36:09 -07:00
parent 84e4e37d14
commit ba0bfd40e2
42 changed files with 819 additions and 1547 deletions
--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
@@ -24,6 +24,7 @@ def _query_server(prompt: str) -> dict:
 def api_server():
    script_path = Path(__file__).parent.joinpath(
        "api_server_async_engine.py").absolute()
+    # pylint: disable=consider-using-with
    uvicorn_process = subprocess.Popen([
        sys.executable, "-u",
        str(script_path), "--model", "facebook/opt-125m"
@@ -32,6 +33,7 @@ def api_server():
    uvicorn_process.terminate()


+# pylint: disable=redefined-outer-name, unused-argument
 def test_api_server(api_server):
    """
    Run the API server and test it.
@@ -47,6 +49,7 @@ def test_api_server(api_server):
        prompts = ["Hello world"] * 1
        result = None
        while not result:
+            # pylint: disable=bare-except
            try:
                for result in pool.map(_query_server, prompts):
                    break