Mirror of https://github.com/fauxpilot/fauxpilot.git (synced 2025-07-16 10:03:25 -07:00)
Fix setup issues and add test script
Signed-off-by: Parth Thakkar <thakkarparth007@gmail.com>
This commit is contained in: parent 2a91018792, commit c6be12979e
8 changed files with 262 additions and 41 deletions
launch.sh (12 changed lines)

@@ -7,17 +7,5 @@ if [ ! -f .env ]; then
 fi
 source .env
 
-export NUM_GPUS=${NUM_GPUS}
-export GPUS=$(seq 0 $(( NUM_GPUS - 1 )) | paste -sd ',')
-
-# if model name starts with "py-", it means we're dealing with the python backend.
-if [[ $(echo "$MODEL" | cut -c1-3) == "py-" ]]; then
-    export MODEL_DIR="${MODEL_DIR}"/"${MODEL}" #/py_model"
-else
-    export MODEL_DIR="${MODEL_DIR}"/"${MODEL}-${NUM_GPUS}gpu"
-fi
-
-export HF_CACHE_DIR=${HF_CACHE_DIR}
-
 # On newer versions, docker-compose is docker compose
 docker compose up -d --remove-orphans || docker-compose up -d --remove-orphans
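With the exports gone, launch.sh only sources .env and starts the containers, so everything it needs must already be in the .env written by setup.sh. As a rough sketch (the keys come from the setup.sh diff below; the values and path here are placeholders, not output of the script), a single-GPU FasterTransformer setup would leave something like:

    NUM_GPUS=1
    GPUS=0
    API_EXTERNAL_PORT=5000
    TRITON_HOST=triton
    TRITON_PORT=8001
    MODEL=codegen-6B-multi
    MODEL_DIR=/path/to/fauxpilot/models/codegen-6B-multi-1gpu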
python_backend/init_model.py
@@ -22,7 +22,7 @@ args = parser.parse_args()
 
 
 # Step1: Make model directory
-model_dir_path = Path(os.path.join(Path(args.model_dir), f"py-{args.model_name}/py-model/1"))
+model_dir_path = Path(os.path.join(Path(args.model_dir), f"py-{args.org_name}-{args.model_name}/py-model/1"))
 model_dir_path.mkdir(parents=True, exist_ok=True)
 
 # Step 2: copy model.py

@@ -41,4 +41,4 @@ config = template.substitute(
 )
 with open(os.path.join(model_dir_path, '../config.pbtxt'), 'w') as f:
     f.write(config)
-    print(f"Config written to")
+    print(f"Config written to {os.path.abspath(f.name)}")
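Because setup.sh now passes the organization name as well, the generated Triton model repository is keyed by both org and model. A hedged sketch of the call and the layout it should produce (argument values are illustrative defaults, not taken from this diff):

    python3 ./python_backend/init_model.py \
        --model_name codegen-350M-mono \
        --org_name Salesforce \
        --model_dir ./models \
        --use_int8 0
    # Roughly expected layout:
    #   models/py-Salesforce-codegen-350M-mono/py-model/1/model.py      <- copied in "Step 2"
    #   models/py-Salesforce-codegen-350M-mono/py-model/config.pbtxt    <- written from the template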
python_backend/model.py
@@ -25,15 +25,16 @@ class TritonPythonModel:
         def get_bool(x):
             return model_config["parameters"][x]["string_value"].lower() in ["1", "true"]
 
-        is_half = get_bool("use_half")
+        is_half = get_bool("use_half") and torch.cuda.is_available()
         # This will make inference marginally slower, but will allow bigger models to fit in GPU
-        int8 = get_bool("use_int8")
-        auto_device_map = get_bool("use_auto_device_map")
+        int8 = get_bool("use_int8") and torch.cuda.is_available()
+        auto_device_map = get_bool("use_auto_device_map") and torch.cuda.is_available()
 
+        print("Cuda available?", torch.cuda.is_available())
         print(f"is_half: {is_half}, int8: {int8}, auto_device_map: {auto_device_map}")
         self.model = AutoModelForCausalLM.from_pretrained(
             model_name,
-            torch_dtype=torch.float16 if is_half else "auto",
+            torch_dtype=torch.float16 if is_half else ("auto" if torch.cuda.is_available() else torch.float32),
             load_in_8bit=int8,
             device_map="auto" if auto_device_map else None,
             low_cpu_mem_usage=True,
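The added torch.cuda.is_available() guards let the Python backend come up on a CPU-only host: half precision, int8, and the automatic device map are all disabled and the weights load as float32. One hedged way to confirm which path a running stack took (the triton service name comes from the compose files further down; this assumes torch is importable inside the Triton image, which this backend itself requires):

    docker-compose exec triton python3 -c "import torch; print(torch.cuda.is_available())"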
setup.sh (49 changed lines)

@@ -23,6 +23,8 @@ check_dep curl
 check_dep zstd
 check_dep docker
 
+############### Common configuration ###############
+
 # Read number of GPUs
 read -rp "Enter number of GPUs [1]: " NUM_GPUS
 NUM_GPUS=${NUM_GPUS:-1}

@@ -36,21 +38,23 @@ TRITON_HOST=${TRITON_HOST:-triton}
 read -rp "Port of Triton host [8001]: " TRITON_PORT
 TRITON_PORT=${TRITON_PORT:-8001}
 
-# Read model directory
-read -rp "Where do you want to save the model [$(pwd)/models]? " MODEL_DIR
-if [ -z "$MODEL_DIR" ]; then
-    MODEL_DIR="$(pwd)/models"
+# Read models root directory (all models go under this)
+read -rp "Where do you want to save your models [$(pwd)/models]? " MODELS_ROOT_DIR
+if [ -z "$MODELS_ROOT_DIR" ]; then
+    MODELS_ROOT_DIR="$(pwd)/models"
 else
-    MODEL_DIR="$(readlink -m "${MODEL_DIR}")"
+    MODELS_ROOT_DIR="$(readlink -m "${MODELS_ROOT_DIR}")"
 fi
+mkdir -p "$MODELS_ROOT_DIR"
 
 # Write .env
 echo "NUM_GPUS=${NUM_GPUS}" >> .env
-echo "MODEL_DIR=${MODEL_DIR}/${MODEL}-${NUM_GPUS}gpu" >> .env
+echo "GPUS=$(seq 0 $(( NUM_GPUS - 1)) | paste -s -d ',' -)" >> .env
 echo "API_EXTERNAL_PORT=${API_EXTERNAL_PORT}" >> .env
 echo "TRITON_HOST=${TRITON_HOST}" >> .env
 echo "TRITON_PORT=${TRITON_PORT}" >> .env
-echo "GPUS=$(seq 0 $(( NUM_GPUS - 1)) | paste -s -d ',' -)" >> .env
+
+############### Backend specific configuration ###############
 
 function fastertransformer_backend(){
     echo "Models available:"

@@ -78,10 +82,11 @@ function fastertransformer_backend(){
         *) MODEL="codegen-6B-multi" ;;
     esac
 
-    echo "MODEL=${MODEL}" > .env
+    echo "MODEL=${MODEL}" >> .env
+    echo "MODEL_DIR=${MODELS_ROOT_DIR}/${MODEL}-${NUM_GPUS}gpu" >> .env
 
-    if (test -d "$MODEL_DIR"/"${MODEL}"-"${NUM_GPUS}"gpu ); then
-        echo "$MODEL_DIR"/"${MODEL}"-"${NUM_GPUS}"gpu
+    if (test -d "$MODELS_ROOT_DIR"/"${MODEL}"-"${NUM_GPUS}"gpu ); then
+        echo "$MODELS_ROOT_DIR"/"${MODEL}"-"${NUM_GPUS}"gpu
         echo "Converted model for ${MODEL}-${NUM_GPUS}gpu already exists."
         read -rp "Do you want to re-use it? y/n: " REUSE_CHOICE
         if [[ ${REUSE_CHOICE:-y} =~ ^[Yy]$ ]]

@@ -90,7 +95,7 @@ function fastertransformer_backend(){
            echo "Re-using model"
        else
            DOWNLOAD_MODEL=y
-            rm -rf "$MODEL_DIR"/"${MODEL}"-"${NUM_GPUS}"gpu
+            rm -rf "$MODELS_ROOT_DIR"/"${MODEL}"-"${NUM_GPUS}"gpu
        fi
    else
        DOWNLOAD_MODEL=y

@@ -98,21 +103,19 @@ function fastertransformer_backend(){
 
    if [[ ${DOWNLOAD_MODEL:-y} =~ ^[Yy]$ ]]
    then
-        # Create model directory
-        mkdir -p "${MODEL_DIR}"
        if [ "$NUM_GPUS" -le 2 ]; then
            echo "Downloading the model from HuggingFace, this will take a while..."
            SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
            DEST="${MODEL}-${NUM_GPUS}gpu"
-            ARCHIVE="${MODEL_DIR}/${DEST}.tar.zst"
-            cp -r "$SCRIPT_DIR"/converter/models/"$DEST" "${MODEL_DIR}"
+            ARCHIVE="${MODELS_ROOT_DIR}/${DEST}.tar.zst"
+            cp -r "$SCRIPT_DIR"/converter/models/"$DEST" "${MODELS_ROOT_DIR}"
            curl -L "https://huggingface.co/moyix/${MODEL}-gptj/resolve/main/${MODEL}-${NUM_GPUS}gpu.tar.zst" \
                -o "$ARCHIVE"
-            zstd -dc "$ARCHIVE" | tar -xf - -C "${MODEL_DIR}"
+            zstd -dc "$ARCHIVE" | tar -xf - -C "${MODELS_ROOT_DIR}"
            rm -f "$ARCHIVE"
        else
            echo "Downloading and converting the model, this will take a while..."
-            docker run --rm -v "${MODEL_DIR}":/models -e MODEL=${MODEL} -e NUM_GPUS="${NUM_GPUS}" moyix/model_converter:latest
+            docker run --rm -v "${MODELS_ROOT_DIR}":/models -e MODEL=${MODEL} -e NUM_GPUS="${NUM_GPUS}" moyix/model_converter:latest
        fi
    fi
 }

@@ -154,15 +157,15 @@ function python_backend(){
     fi
 
     # Write config.env
-    echo "MODEL=py-${MODEL}" > config.env
-    echo "HF_CACHE_DIR=${HF_CACHE_DIR}" >> config.env
+    echo "MODEL=py-${MODEL}" >> .env
+    echo "MODEL_DIR=${MODELS_ROOT_DIR}/py-${ORG}-${MODEL}" >> .env # different format from fastertransformer backend
+    echo "HF_CACHE_DIR=${HF_CACHE_DIR}" >> .env
 
-    # Create model directory
-    mkdir -p "${MODEL_DIR}/"
-
-    python3 ./python_backend/init_model.py --model_name "${MODEL}" --org_name "${ORG}" --model_dir "${MODEL_DIR}" --use_int8 "${USE_INT8}"
+    python3 ./python_backend/init_model.py --model_name "${MODEL}" --org_name "${ORG}" --model_dir "${MODELS_ROOT_DIR}" --use_int8 "${USE_INT8}"
 }
 
+common_config
+
 # choose backend
 echo "Choose your backend:"
 echo "[1] FasterTransformer backend (faster, but limited models)"
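Both backends now append their settings to the same .env instead of the Python backend keeping a separate config.env, which is what lets the simplified launch.sh above work unchanged for either backend. A hedged end-to-end sketch for the Python backend (model and org shown as the codegen-350M-mono / Salesforce defaults; your values will differ):

    ./setup.sh     # answer the prompts: backend [2], model [1] codegen-350M-mono, etc.
    cat .env       # expect lines like MODEL=py-codegen-350M-mono,
                   # MODEL_DIR=<models root>/py-Salesforce-codegen-350M-mono, HF_CACHE_DIR=...
    ./launch.sh    # starts triton and copilot_proxy via docker compose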
tests/python_backend/.gitignore (new file, 2 lines)

test.env
models/*
tests/python_backend/docker-compose-with-gpus.yaml (new file, 35 lines)

version: '3.3'
services:
  triton:
    build:
      context: ../../
      dockerfile: Dockerfile
    command: bash -c "CUDA_VISIBLE_DEVICES="${GPUS}" mpirun -n 1 --allow-run-as-root /opt/tritonserver/bin/tritonserver --model-repository=/model"
    shm_size: '2gb'
    volumes:
      - ${MODEL_DIR}:/model
      - ${HF_CACHE_DIR}:/root/.cache/huggingface
    ports:
      - "8000:8000"
      - "8001:8001"
      - "8002:8002"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
  copilot_proxy:
    # For dockerhub version
    # image: moyix/copilot_proxy:latest
    # command: python3 -m flask run --host=0.0.0.0 --port=5000
    # For local build
    build:
      context: ../../
      dockerfile: copilot_proxy/Dockerfile
    env_file:
      # Automatically created via ./setup.sh
      - test.env
    ports:
      - "${API_EXTERNAL_PORT}:5000"
tests/python_backend/docker-compose-without-gpus.yaml (new file, 28 lines)

version: '3.3'
services:
  triton:
    build:
      context: ../../
      dockerfile: Dockerfile
    command: bash -c "CUDA_VISIBLE_DEVICES="${GPUS}" mpirun -n 1 --allow-run-as-root /opt/tritonserver/bin/tritonserver --model-repository=/model"
    shm_size: '2gb'
    volumes:
      - ${MODEL_DIR}:/model
      - ${HF_CACHE_DIR}:/root/.cache/huggingface
    ports:
      - "8000:8000"
      - "8001:8001"
      - "8002:8002"
  copilot_proxy:
    # For dockerhub version
    # image: moyix/copilot_proxy:latest
    # command: python3 -m flask run --host=0.0.0.0 --port=5000
    # For local build
    build:
      context: ../../
      dockerfile: copilot_proxy/Dockerfile
    env_file:
      # Automatically created via ./setup.sh
      - test.env
    ports:
      - "${API_EXTERNAL_PORT}:5000"
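Both compose files interpolate MODEL_DIR, HF_CACHE_DIR, GPUS, and API_EXTERNAL_PORT from the caller's environment, and additionally hand test.env to the proxy container via env_file. The test script exports these by parsing test.env itself; a hedged way to do the same from a shell is:

    set -a && source test.env && set +a     # export every variable the compose files interpolate
    docker-compose -f docker-compose-without-gpus.yaml up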
tests/python_backend/test_setup.py (new file, 164 lines)

"Tests setup script (currently for Python backend)"

import os
import subprocess
import signal
import shutil
from pathlib import Path
from typing import Dict, Union

import pexpect
import pytest
import requests

curdir = Path(__file__).parent
root = curdir.parent.parent

test_models_dir = curdir/"models"


def setup_module():
    "Setup steps for tests in this module"
    assert (root/"setup.sh").exists(), "setup.sh not found"
    if (root/".env").exists():
        shutil.move(str(root/".env"), str(root/".env.bak"))

def teardown_module():
    "Teardown steps for tests in this module"
    if (root/".env.bak").exists():
        shutil.move(str(root/".env.bak"), str(root/".env"))
    try:
        if test_models_dir.exists():
            shutil.rmtree(test_models_dir)
    except Exception as exc:
        print(
            f"WARNING: Couldn't delete `{test_models_dir}` most likely due to permission issues."
            f"Run the tests with sudo to ensure this gets deleted automatically, or else delete manually. Exception: {exc}"
        )

def enter_input(proc: pexpect.spawn, expect: str, input_s: str, timeout: int = 5) -> str:
    "Helper function to enter input for a given prompt. Returns consumed output."

    try:
        proc.expect(expect, timeout=timeout)
    except pexpect.exceptions.TIMEOUT as exc:
        raise AssertionError(
            f"Timeout waiting for prompt: `{expect}`.\n"
            f"Output-before: `{proc.before}`\nOutput-after: `{proc.after}`"
        ) from exc

    after = str(proc.after)
    print(after)
    proc.sendline(input_s)
    return after

def run_common_setup_steps(n_gpus: int = 0) -> pexpect.spawn:
    "Helper function to run common setup steps."
    proc = pexpect.pty_spawn.spawn(
        "./setup.sh 2>&1", encoding="utf-8", cwd=str(root),
    )
    proc.ignorecase = True

    enter_input(proc, r".*Enter number of GPUs[^:]+: ?", str(n_gpus))
    enter_input(proc, r".*port for the API[^:]+: ?", "5000")
    enter_input(proc, r".*Address for Triton[^:]+: ?", "triton")
    enter_input(proc, r".*Port of Triton[^:]+: ?", "8001")
    enter_input(proc, r".*save your models[^\?]+\? ?", str(test_models_dir.absolute()))

    return proc

def load_test_env():
    "Load test env vars"
    # Without loading default env vars, PATH won't be set correctly
    env = os.environ.copy()
    with open(curdir/"test.env", "r", encoding="utf8") as test_env:
        for line in test_env:
            key, val = line.strip().split("=")
            env[key] = val
    return env

def run_inference(
    prompt: str, model_name: str = "py-model", port: int = 5000, return_all: bool = False,
    **kwargs
) -> Union[str, Dict]:
    "Invokes the copilot proxy with the given prompt and returns the completion"
    endpoint = f"http://localhost:{port}/v1/engines/codegen/completions"
    data = {
        "model": model_name,
        "prompt": prompt,
        "suffix": kwargs.get("suffix", ""),
        "max_tokens": kwargs.get("max_tokens", 16),
        "temperature": kwargs.get("temperature", 0.0),
        "top_p": kwargs.get("top_p", 1.0),
        "n": kwargs.get("n", 1),
        "stream": kwargs.get("stream", None),  # it's not true/false. It's None or not None :[
        "logprobs": kwargs.get("logprobs", 0),
        "stop": kwargs.get("stop", ""),
        "echo": kwargs.get("echo", True),
        "presence_penalty": kwargs.get("presence_penalty", 0.0),
        "frequency_penalty": kwargs.get("frequency_penalty", 0.0),
        "best_of": kwargs.get("best_of", 1),
        "logit_bias": kwargs.get("logit_bias", {}),
        "user": kwargs.get("user", "test"),
    }

    response = requests.post(endpoint, json=data)
    response.raise_for_status()

    if return_all:
        return response.json()
    return response.json()["choices"][0]["text"]


@pytest.mark.parametrize("n_gpus", [0])  # we don't have a GPU on CI
def test_python_backend(n_gpus: int):
    """
    Step 1: run $root/setup.sh while passing appropriate options via stdin
    Step 2: run docker-compose up with test.env sourced
    Step 3: call :5000 with appropriate request
    """
    proc = run_common_setup_steps(n_gpus)

    choices = enter_input(proc, r".*Choose your backend.*Enter your choice[^:]+: ?", "2")
    assert "[2] Python backend" in choices, "Option 2 should be Python backend"

    choices = enter_input(proc, r".*Models available:.*Enter your choice[^:]+: ?", "1")
    assert "[1] codegen-350M-mono" in choices, "Option 1 should be codegen-350M-mono"

    enter_input(proc, r".*share (your )?huggingface cache[^:]+: ?", "y")
    enter_input(proc, r".*cache directory[^:]+: ?", "")  # default
    enter_input(proc, r".*use int8[^:]+: ?", "n")
    enter_input(proc, r".*run FauxPilot\? \[y/n\] ", "n")

    # copy $root/.env to $curdir/test.env
    shutil.copy(str(root/".env"), str(curdir/"test.env"))

    # run docker-compose up -f docker-compose-{without|with}-gpus.yml
    compose_file = f"docker-compose-with{'' if n_gpus > 0 else 'out'}-gpus.yaml"
    docker_proc = None
    try:
        docker_proc = pexpect.pty_spawn.spawn(
            f"docker-compose -f {compose_file} up",
            encoding="utf-8",
            cwd=curdir,
            env=load_test_env(),
        )

        print("Waiting for API to be ready...")
        docker_proc.expect(r".*Started GRPCInferenceService at 0.0.0.0:8001", timeout=120)

        print("API ready, sending request...")

        # Simple test 1: hello world prompt without bells and whistles
        response = run_inference("def hello_world():\n", max_tokens=16, return_all=True)
        assert response["choices"][0]["text"].rstrip() == ' print("Hello World")\n\nhello_world()\n\n#'
        assert response["choices"][0]["finish_reason"] == "length"

    finally:
        if docker_proc is not None and docker_proc.isalive():
            docker_proc.kill(signal.SIGINT)

        # killing docker-compose process doesn't bring down the containers.
        # explicitly stop the containers:
        subprocess.run(["docker-compose", "-f", compose_file, "down"], cwd=curdir, check=True)
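The test drives setup.sh through a pseudo-terminal, brings the stack up with docker-compose, and then checks a real completion against the proxy, so it needs Docker running plus the Python packages imported above. A typical invocation would look roughly like:

    pip install pytest pexpect requests
    pytest tests/python_backend/test_setup.py -s    # -s keeps the setup prompts and compose logs visible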