mirror of https://github.com/fauxpilot/fauxpilot.git
synced 2025-07-15 01:23:25 -07:00

Resolve merge conflicts and fix issues with setup.sh

commit 2a91018792
9 changed files with 115 additions and 92 deletions
copilot_proxy/Dockerfile
@@ -10,4 +10,4 @@ COPY copilot_proxy .
 
 EXPOSE 5000
 
-CMD [ "uvicorn", "--host", "0.0.0.0", "--port", "5000", "app:app"]
+CMD ["uvicorn", "--host", "0.0.0.0", "--port", "5000", "app:app"]
copilot_proxy/app.py
@@ -41,4 +41,4 @@ async def completions(data: OpenAIinput):
     )
 
 if __name__ == "__main__":
-    uvicorn.run("app:app", host=os.environ.get("API_HOST", "0.0.0.0"), port=os.environ.get("API_PORT", 5000))
+    uvicorn.run("app:app", host="0.0.0.0", port=5000)
copilot_proxy/models.py
@@ -4,7 +4,7 @@ from pydantic import BaseModel
 
 
 class OpenAIinput(BaseModel):
-    model: str = "fastertransformer|py-model"
+    model: str = "fastertransformer"
     prompt: Optional[str]
     suffix: Optional[str]
     max_tokens: Optional[int] = 16
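With these defaults, a request only needs the fields it wants to override. As a hedged illustration, a completion request against the proxy could look like the following; the endpoint path is an assumption (the route is defined elsewhere in app.py, not in this hunk), and the payload values are examples only:

    # Illustrative request; route path and values are assumptions, not from this diff.
    curl -s http://localhost:5000/v1/engines/codegen/completions \
      -H 'Content-Type: application/json' \
      -d '{"model": "fastertransformer", "prompt": "def hello():", "max_tokens": 16}'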
copilot_proxy/utils/codegen.py
@@ -6,14 +6,14 @@ import time
 import numpy as np
 import tritonclient.grpc as client_util
 from tokenizers import Tokenizer
-from tritonclient.utils import np_to_triton_dtype
+from tritonclient.utils import np_to_triton_dtype, InferenceServerException
 
 np.finfo(np.dtype("float32"))
 np.finfo(np.dtype("float64"))
 
 
 class CodeGenProxy:
-    def __init__(self, host: str = 'localhost', port: int = 8001, verbose: bool = False):
+    def __init__(self, host: str = 'triton', port: int = 8001, verbose: bool = False):
         self.tokenizer = Tokenizer.from_file('/python-docker/cgtok/tokenizer.json')
         self.client = client_util.InferenceServerClient(url=f'{host}:{port}', verbose=verbose)
         self.PAD_CHAR = 50256
@@ -234,7 +234,12 @@ class CodeGenProxy:
 
     def __call__(self, data: dict):
         st = time.time()
-        completion, choices = self.generate(data)
+        try:
+            completion, choices = self.generate(data)
+        except InferenceServerException as E:
+            print(E)
+            completion = {}
+            choices = []
         ed = time.time()
         print(f"Returned completion in {(ed - st) * 1000} ms")
         if data.get('stream', False):
docker-compose.yaml
@@ -28,9 +28,8 @@ services:
     build:
       context: .
       dockerfile: copilot_proxy/Dockerfile
-    command: uvicorn app:app --host 0.0.0.0 --port 5000
     env_file:
-      # You can modify this env file to configure your proxy environment
-      - example.env
+      # Automatically created via ./setup.sh
+      - .env
     ports:
-      - "5000:5000"
+      - "${API_EXTERNAL_PORT}:5000"
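The compose service now reads its configuration from the .env file that setup.sh generates (see the setup.sh diff below) rather than the checked-in example.env. As a rough sketch, the variables recorded in .env for a single-GPU run with default answers would look like this; the values are illustrative, not captured from a real run:

    # Illustrative .env contents; actual values depend on the setup.sh prompts.
    MODEL=codegen-6B-multi
    NUM_GPUS=1
    MODEL_DIR=/home/me/models/codegen-6B-multi-1gpu
    API_EXTERNAL_PORT=5000
    TRITON_HOST=triton
    TRITON_PORT=8001
    GPUS=0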
18 launch.sh
@@ -1,13 +1,14 @@
 #!/usr/bin/env bash
 
-# Read in config.env file; error if not found
-if [ ! -f config.env ]; then
-  echo "config.env not found, please run setup.sh"
-  exit 1
+# Read in .env file; error if not found
+if [ ! -f .env ]; then
+  echo ".env not found, running setup.sh"
+  bash setup.sh
 fi
-source config.env
+source .env
 
 export NUM_GPUS=${NUM_GPUS}
+export GPUS=$(seq 0 $(( NUM_GPUS - 1 )) | paste -sd ',')
 
 # if model name starts with "py-", it means we're dealing with the python backend.
 if [[ $(echo "$MODEL" | cut -c1-3) == "py-" ]]; then
@@ -16,12 +17,7 @@ else
   export MODEL_DIR="${MODEL_DIR}"/"${MODEL}-${NUM_GPUS}gpu"
 fi
 
-export GPUS=$(seq 0 $(( NUM_GPUS - 1 )) | paste -sd ',')
 export HF_CACHE_DIR=${HF_CACHE_DIR}
 
 # On newer versions, docker-compose is docker compose
-if command -v docker-compose > /dev/null; then
-  docker compose up
-else
-  docker-compose up
-fi
+docker compose up -d --remove-orphans || docker-compose up -d --remove-orphans
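The new GPUS export turns NUM_GPUS into the comma-separated device list the containers use. A quick sketch of what that pipeline produces:

    NUM_GPUS=4
    seq 0 $(( NUM_GPUS - 1 )) | paste -sd ','   # prints: 0,1,2,3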
python_backend/init_model.py
@@ -3,12 +3,13 @@ A simple script that sets up the model directory of a given model for Triton.
 """
 
 import argparse
+import os
 import shutil
 from pathlib import Path
 from string import Template
 
 SCRIPT_DIR = Path(__file__).parent
-CONFIG_TEMPLATE_PATH = SCRIPT_DIR/'config_template.pbtxt'
+CONFIG_TEMPLATE_PATH = os.path.join(SCRIPT_DIR, 'config_template.pbtxt')
 
 parser = argparse.ArgumentParser()
 parser.add_argument("--model_dir", type=str, required=True)
@@ -21,11 +22,11 @@ args = parser.parse_args()
 
 
 # Step1: Make model directory
-model_dir_path = Path(args.model_dir)/f"py-{args.model_name}/py-model/1"
+model_dir_path = Path(os.path.join(Path(args.model_dir), f"py-{args.model_name}/py-model/1"))
 model_dir_path.mkdir(parents=True, exist_ok=True)
 
 # Step 2: copy model.py
-shutil.copy(SCRIPT_DIR/'model.py', model_dir_path/'model.py')
+shutil.copy(os.path.join(SCRIPT_DIR, 'model.py'), os.path.join(model_dir_path, 'model.py'))
 
 # Step 3: Generate config.pbtxt
 with open(CONFIG_TEMPLATE_PATH, 'r') as f:
@@ -38,5 +39,6 @@ config = template.substitute(
     use_int8=args.use_int8,
     use_auto_device_map=args.use_auto_device_map,
 )
-with open(model_dir_path/'../config.pbtxt', 'w') as f:
+with open(os.path.join(model_dir_path, '../config.pbtxt'), 'w') as f:
     f.write(config)
+print(f"Config written to")
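setup.sh drives this script from its python_backend function (see below). A representative invocation, with illustrative argument values mirroring that call, would be:

    # Arguments mirror the call in setup.sh; the values here are examples only.
    python3 ./python_backend/init_model.py --model_name codegen-2B-multi \
      --org_name Salesforce --model_dir "$(pwd)/models" --use_int8 1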
139 setup.sh
@@ -1,9 +1,15 @@
 #!/usr/bin/env bash
 
-if [ -f config.env ]; then
-  echo "config.env already exists, skipping"
-  echo "Please delete config.env if you want to re-run this script"
-  exit 1
+if [ -f .env ]; then
+  read -rp ".env already exists, do you want to delete .env and recreate it? [y/n] " DELETE
+  if [[ ${DELETE:-y} =~ ^[Yy]$ ]]
+  then
+    echo "Deleting .env"
+    rm .env
+  else
+    echo "Exiting"
+    exit 0
+  fi;
 fi
 
 function check_dep(){
@@ -17,6 +23,34 @@ check_dep curl
 check_dep zstd
 check_dep docker
 
+# Read number of GPUs
+read -rp "Enter number of GPUs [1]: " NUM_GPUS
+NUM_GPUS=${NUM_GPUS:-1}
+
+read -rp "External port for the API [5000]: " API_EXTERNAL_PORT
+API_EXTERNAL_PORT=${API_EXTERNAL_PORT:-5000}
+
+read -rp "Address for Triton [triton]: " TRITON_HOST
+TRITON_HOST=${TRITON_HOST:-triton}
+
+read -rp "Port of Triton host [8001]: " TRITON_PORT
+TRITON_PORT=${TRITON_PORT:-8001}
+
+# Read model directory
+read -rp "Where do you want to save the model [$(pwd)/models]? " MODEL_DIR
+if [ -z "$MODEL_DIR" ]; then
+  MODEL_DIR="$(pwd)/models"
+else
+  MODEL_DIR="$(readlink -m "${MODEL_DIR}")"
+fi
+
+# Write .env
+echo "NUM_GPUS=${NUM_GPUS}" >> .env
+echo "MODEL_DIR=${MODEL_DIR}/${MODEL}-${NUM_GPUS}gpu" >> .env
+echo "API_EXTERNAL_PORT=${API_EXTERNAL_PORT}" >> .env
+echo "TRITON_HOST=${TRITON_HOST}" >> .env
+echo "TRITON_PORT=${TRITON_PORT}" >> .env
+echo "GPUS=$(seq 0 $(( NUM_GPUS - 1)) | paste -s -d ',' -)" >> .env
+
 function fastertransformer_backend(){
   echo "Models available:"
@@ -29,7 +63,7 @@ function fastertransformer_backend(){
   echo "[7] codegen-16B-mono (32GB total VRAM required; Python-only)"
   echo "[8] codegen-16B-multi (32GB total VRAM required; multi-language)"
   # Read their choice
-  read -p "Enter your choice [6]: " MODEL_NUM
+  read -rp "Enter your choice [6]: " MODEL_NUM
 
   # Convert model number to model name
   case $MODEL_NUM in
@@ -44,36 +78,29 @@ function fastertransformer_backend(){
     *) MODEL="codegen-6B-multi" ;;
   esac
 
-  # Read number of GPUs
-  read -p "Enter number of GPUs [1]: " NUM_GPUS
-  NUM_GPUS=${NUM_GPUS:-1}
-
-  # Read model directory
-  read -p "Where do you want to save the model [$(pwd)/models]? " MODEL_DIR
-  if [ -z "$MODEL_DIR" ]; then
-    MODEL_DIR="$(pwd)/models"
-  else
-    MODEL_DIR="$(readlink -m "${MODEL_DIR}")"
-  fi
-
-  # Write config.env
-  echo "MODEL=${MODEL}" > config.env
-  echo "NUM_GPUS=${NUM_GPUS}" >> config.env
-  echo "MODEL_DIR=${MODEL_DIR}" >> config.env
-
-  if [ -d "$MODEL_DIR"/"${MODEL}"-${NUM_GPUS}gpu ]; then
+  echo "MODEL=${MODEL}" > .env
+
+  if (test -d "$MODEL_DIR"/"${MODEL}"-"${NUM_GPUS}"gpu ); then
+    echo "$MODEL_DIR"/"${MODEL}"-"${NUM_GPUS}"gpu
     echo "Converted model for ${MODEL}-${NUM_GPUS}gpu already exists."
-    read -p "Do you want to re-use it? y/n: " REUSE_CHOICE
-    if [ "${REUSE_CHOICE^^}" = "Y" ]; then
-      exit 0
+    read -rp "Do you want to re-use it? y/n: " REUSE_CHOICE
+    if [[ ${REUSE_CHOICE:-y} =~ ^[Yy]$ ]]
+    then
+      DOWNLOAD_MODEL=n
+      echo "Re-using model"
+    else
+      DOWNLOAD_MODEL=y
+      rm -rf "$MODEL_DIR"/"${MODEL}"-"${NUM_GPUS}"gpu
     fi
+  else
+    DOWNLOAD_MODEL=y
   fi
 
+  if [[ ${DOWNLOAD_MODEL:-y} =~ ^[Yy]$ ]]
+  then
   # Create model directory
   mkdir -p "${MODEL_DIR}"
-
-  # For some of the models we can download it preconverted.
-  if [ $NUM_GPUS -le 2 ]; then
+  if [ "$NUM_GPUS" -le 2 ]; then
   echo "Downloading the model from HuggingFace, this will take a while..."
   SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
   DEST="${MODEL}-${NUM_GPUS}gpu"
@@ -85,9 +112,9 @@ function fastertransformer_backend(){
     rm -f "$ARCHIVE"
   else
     echo "Downloading and converting the model, this will take a while..."
-    docker run --rm -v ${MODEL_DIR}:/models -e MODEL=${MODEL} -e NUM_GPUS=${NUM_GPUS} moyix/model_converter:latest
+    docker run --rm -v "${MODEL_DIR}":/models -e MODEL=${MODEL} -e NUM_GPUS="${NUM_GPUS}" moyix/model_converter:latest
   fi
-  echo "Done! Now run ./launch.sh to start the FauxPilot server."
+  fi
 }
 
 function python_backend(){
@@ -96,12 +123,8 @@ function python_backend(){
   echo "[2] codegen-350M-multi (1GB total VRAM required; multi-language)"
   echo "[3] codegen-2B-mono (4GB total VRAM required; Python-only)"
   echo "[4] codegen-2B-multi (4GB total VRAM required; multi-language)"
-  # echo "[5] codegen-6B-mono (13GB total VRAM required; Python-only)"
-  # echo "[6] codegen-6B-multi (13GB total VRAM required; multi-language)"
-  # echo "[7] codegen-16B-mono (32GB total VRAM required; Python-only)"
-  # echo "[8] codegen-16B-multi (32GB total VRAM required; multi-language)"
-  # Read their choice
-  read -p "Enter your choice [4]: " MODEL_NUM
+  read -rp "Enter your choice [4]: " MODEL_NUM
 
   # Convert model number to model name
   case $MODEL_NUM in
@@ -109,36 +132,22 @@ function python_backend(){
     2) MODEL="codegen-350M-multi"; ORG="Salesforce" ;;
     3) MODEL="codegen-2B-mono"; ORG="Salesforce" ;;
     4) MODEL="codegen-2B-multi"; ORG="Salesforce" ;;
+    *) MODEL="codegen-2B-multi"; ORG="Salesforce" ;;
   esac
 
-  # Read number of GPUs -- not strictly required for python backend, because of device_map="auto",
-  # but docker-compose.py uses it to select CUDA_VISIBLE_DEVICES
-  read -p "Enter number of GPUs [1]: " NUM_GPUS
-  NUM_GPUS=${NUM_GPUS:-1}
-
-  # Read model directory
-  read -p "Where do you want to save the model [$(pwd)/models]? " MODEL_DIR
-  MODEL_DIR=${MODEL_DIR:-$(pwd)/models}
-  if [ -z "$MODEL_DIR" ]; then
-    MODEL_DIR="$(pwd)/models"
-  else
-    MODEL_DIR="$(readlink -m "${MODEL_DIR}")"
-  fi
-
   # share huggingface cache? Should be safe to share, but permission issues may arise depending upon your docker setup
-  read -p "Do you want to share your huggingface cache between host and docker container? y/n [n]: " SHARE_HF_CACHE
+  read -rp "Do you want to share your huggingface cache between host and docker container? y/n [n]: " SHARE_HF_CACHE
   SHARE_HF_CACHE=${SHARE_HF_CACHE:-n}
-  if [ "${SHARE_HF_CACHE^^}" = "Y" ]; then
-    read -p "Enter your huggingface cache directory [$HOME/.cache/huggingface]: " HF_CACHE_DIR
+  if [[ ${SHARE_HF_CACHE:-y} =~ ^[Yy]$ ]]; then
+    read -rp "Enter your huggingface cache directory [$HOME/.cache/huggingface]: " HF_CACHE_DIR
     HF_CACHE_DIR=${HF_CACHE_DIR:-$HOME/.cache/huggingface}
   else
    HF_CACHE_DIR="/tmp/hf_cache"
   fi
 
   # use int8? Allows larger models to fit in GPU but might be very marginally slower
-  read -p "Do you want to use int8? y/n [y]: " USE_INT8
-  USE_INT8=${USE_INT8:-y}
-  if [ "${USE_INT8^^}" = "N" ]; then
+  read -rp "Do you want to use int8? y/n [y]: " USE_INT8
+  if [[ ${USE_INT8:-y} =~ ^[Nn]$ ]]; then
     USE_INT8="0"
   else
     USE_INT8="1"
@@ -146,25 +155,31 @@ function python_backend(){
 
   # Write config.env
   echo "MODEL=py-${MODEL}" > config.env
-  echo "NUM_GPUS=${NUM_GPUS}" >> config.env
-  echo "MODEL_DIR=${MODEL_DIR}" >> config.env
   echo "HF_CACHE_DIR=${HF_CACHE_DIR}" >> config.env
 
   # Create model directory
   mkdir -p "${MODEL_DIR}/"
-  python3 ./python_backend/init_model.py --model_name "${MODEL}" --org_name "${ORG}" --model_dir "${MODEL_DIR}" --use_int8 "${USE_INT8}"
 
-  echo "Done! Now run ./launch.sh to start the FauxPilot server."
+  python3 ./python_backend/init_model.py --model_name "${MODEL}" --org_name "${ORG}" --model_dir "${MODEL_DIR}" --use_int8 "${USE_INT8}"
 }
 
 # choose backend
 echo "Choose your backend:"
 echo "[1] FasterTransformer backend (faster, but limited models)"
 echo "[2] Python backend (slower, but more models, and allows loading with int8)"
-read -p "Enter your choice [1]: " BACKEND_NUM
+read -rp "Enter your choice [1]: " BACKEND_NUM
 
-if [ $BACKEND_NUM -eq 2 ]; then
+if [ "$BACKEND_NUM" -eq 2 ]; then
   python_backend
 else
   fastertransformer_backend
 fi
+
+read -rp "Config complete, do you want to run FauxPilot? [y/n] " RUN
+if [[ ${RUN:-y} =~ ^[Yy]$ ]]
+then
+  bash ./launch.sh
+else
+  echo "You can run ./launch.sh to start the FauxPilot server."
+  exit 0
+fi;
6 shutdown.sh (new executable file)
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+source .env
+
+# On newer versions, docker-compose is docker compose
+docker compose down --remove-orphans || docker-compose down --remove-orphans
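With shutdown.sh added, the three scripts cover the whole server lifecycle; both launch.sh and shutdown.sh try the newer docker compose plugin first and fall back to the standalone docker-compose binary via ||. A typical sequence, run from the repo root:

    ./setup.sh      # writes .env and downloads/converts a model
    ./launch.sh     # brings the containers up via docker compose
    ./shutdown.sh   # tears them down again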