Mirror of https://github.com/fauxpilot/fauxpilot.git (synced 2025-08-21 05:44:09 -07:00)

Merge pull request #86 from thakkarparth007/python_backend
Add python backend support

Commit 92dc571108: 11 changed files with 705 additions and 76 deletions

Dockerfile (new file, 5 lines)

@@ -0,0 +1,5 @@
FROM moyix/triton_with_ft:22.09

# Install dependencies: torch
RUN python3 -m pip install --disable-pip-version-check -U torch --extra-index-url https://download.pytorch.org/whl/cu116
RUN python3 -m pip install --disable-pip-version-check -U transformers bitsandbytes accelerate

@@ -73,13 +73,17 @@ class CodeGenProxy:
        return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2))

    def generate(self, data):
        model_name = "fastertransformer"
        prompt = data['prompt']
        n = data.get('n', 1)
        model_name = data["model"]
        # ugly hack to set the data type correctly. Huggingface models want int32, but fastertransformer needs uint32
        # i could've done the conversion from uint32 to int32 in the model but that'd be inefficient.
        np_type = np.int32 if model_name.startswith("py-") else np.uint32

        input_start_ids = np.expand_dims(self.tokenizer.encode(prompt).ids, 0)
        input_start_ids = np.repeat(input_start_ids, n, axis=0).astype(np.uint32)
        input_start_ids = np.repeat(input_start_ids, n, axis=0).astype(np_type)
        prompt_len = input_start_ids.shape[1]
        input_len = prompt_len * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
        input_len = prompt_len * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
        max_tokens = data.get('max_tokens', 16)
        prompt_tokens: int = input_len[0][0]
        requested_tokens = max_tokens + prompt_tokens

@@ -90,7 +94,7 @@ class CodeGenProxy:
            f"{requested_tokens} tokens ({prompt_tokens} in your prompt; {max_tokens} for the completion). "
            f"Please reduce your prompt; or completion length."
        )
        output_len = np.ones_like(input_len).astype(np.uint32) * max_tokens
        output_len = np.ones_like(input_len).astype(np_type) * max_tokens
        num_logprobs = data.get('logprobs', -1)
        if num_logprobs is None:
            num_logprobs = 1

@@ -105,7 +109,7 @@ class CodeGenProxy:
        top_p = data.get('top_p', 1.0)
        frequency_penalty = data.get('frequency_penalty', 1.0)
        runtime_top_k = top_k * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
        runtime_top_k = top_k * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
        runtime_top_p = top_p * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
        beam_search_diversity_rate = 0.0 * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
        random_seed = np.random.randint(0, 2 ** 31 - 1, (input_start_ids.shape[0], 1), dtype=np.int32)

@@ -113,9 +117,9 @@ class CodeGenProxy:
        len_penalty = 1.0 * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
        repetition_penalty = frequency_penalty * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
        is_return_log_probs = want_logprobs * np.ones([input_start_ids.shape[0], 1]).astype(np.bool_)
        beam_width = (1 * np.ones([input_start_ids.shape[0], 1])).astype(np.uint32)
        start_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
        end_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
        beam_width = (1 * np.ones([input_start_ids.shape[0], 1])).astype(np_type)
        start_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
        end_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np_type)

        stop_words = data.get('stop', [])
        if stop_words is None:
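
The np_type switch in the CodeGenProxy hunks above is what lets one proxy serve both backends: FasterTransformer expects uint32 token tensors, while the HuggingFace models behind the Triton Python backend take int32, and the proxy decides purely from the "py-" prefix of the requested model name. A minimal sketch of that selection (the token ids and model name below are placeholders, not part of the patch):

    import numpy as np

    def pick_np_type(model_name):
        # Models served by the Python backend are registered with a "py-" prefix
        # and take int32 inputs; FasterTransformer models take uint32.
        return np.int32 if model_name.startswith("py-") else np.uint32

    token_ids = [1, 2, 3, 4]   # placeholder for tokenizer.encode(prompt).ids
    n = 2                      # number of completions requested
    np_type = pick_np_type("py-codegen-350M-mono")
    input_start_ids = np.repeat(np.expand_dims(token_ids, 0), n, axis=0).astype(np_type)
    print(input_start_ids.dtype, input_start_ids.shape)  # int32 (2, 4)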

@@ -1,11 +1,14 @@
version: '3.3'
services:
  triton:
    image: moyix/triton_with_ft:22.09
    build:
      context: .
      dockerfile: Dockerfile
    command: bash -c "CUDA_VISIBLE_DEVICES=${GPUS} mpirun -n 1 --allow-run-as-root /opt/tritonserver/bin/tritonserver --model-repository=/model"
    shm_size: '2gb'
    volumes:
      - ${MODEL_DIR}:/model
      - ${HF_CACHE_DIR}:/root/.cache/huggingface
    ports:
      - "8000:8000"
      - "8001:8001"

python_backend/config_template.pbtxt (new file, 180 lines)

@@ -0,0 +1,180 @@
name: "py-model"
backend: "python"
max_batch_size: 4
input [
  {
    name: "input_ids"
    data_type: TYPE_INT32
    dims: [ -1 ]
  },
  {
    # UNUSED
    name: "start_id"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    # UNUSED
    name: "end_id"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "input_lengths"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
  },
  {
    name: "request_output_len"
    data_type: TYPE_INT32
    dims: [ -1 ]
  },
  {
    name: "runtime_top_k"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "runtime_top_p"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    # UNUSED
    name: "beam_search_diversity_rate"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "temperature"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    # UNUSED
    name: "len_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    # UNUSED
    name: "repetition_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    # UNUSED
    name: "random_seed"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    # UNUSED
    name: "is_return_log_probs"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    # UNUSED
    name: "beam_width"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    # UNUSED
    name: "bad_words_list"
    data_type: TYPE_INT32
    dims: [ 2, -1 ]
    optional: true
  },
  {
    # UNUSED
    name: "stop_words_list"
    data_type: TYPE_INT32
    dims: [ 2, -1 ]
    optional: true
  }
]
output [
  {
    name: "output_ids"
    data_type: TYPE_INT32
    dims: [ -1, -1, -1 ]
  },
  {
    name: "sequence_length"
    data_type: TYPE_INT32
    dims: [ -1, -1 ]
  } #,
  # Following is currently unsupported, but should be supported in the future
  # {
  #   name: "cum_log_probs"
  #   data_type: TYPE_FP32
  #   dims: [ -1 ]
  # },
  # {
  #   name: "output_log_probs"
  #   data_type: TYPE_FP32
  #   dims: [ -1, -1 ]
  # }
]
# unsure what this is for
instance_group [
  {
    count: 1
    kind: KIND_CPU
  }
]
parameters {
  key: "use_half"
  value: {
    string_value: "1"
  }
}
parameters {
  key: "model_name"
  value: {
    string_value: "${model_name}" # e.g. "codegen-350M-multi"
  }
}
parameters {
  key: "org_name"
  value: {
    string_value: "${org_name}" # e.g. "Salesforce"
  }
}
parameters {
  key: "use_int8",
  value: {
    string_value: "${use_int8}" # e.g. "0" or "1"
  }
}
parameters {
  key: "use_auto_device_map",
  value: {
    string_value: "${use_auto_device_map}" # e.g. "0" or "1"
  }
}
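
Taken together, this config defines the request surface of the Python backend: int32 token ids plus per-request sampling knobs, with several FasterTransformer-specific inputs accepted but marked UNUSED. As a rough sketch (not part of the patch), a gRPC call that supplies the required inputs plus the tensors model.py actually reads could look like this, assuming the tritonclient package and the Triton container from docker-compose listening on localhost:8001:

    import numpy as np
    import tritonclient.grpc as grpcclient
    from tritonclient.utils import np_to_triton_dtype

    client = grpcclient.InferenceServerClient("localhost:8001")

    ids = np.array([[1, 2, 3, 4]], dtype=np.int32)  # one already-tokenized prompt (placeholder ids)
    tensors = {
        "input_ids": ids,
        "input_lengths": np.array([[ids.shape[1]]], dtype=np.int32),
        "request_output_len": np.array([[16]], dtype=np.int32),
        "runtime_top_k": np.array([[1]], dtype=np.int32),
        "runtime_top_p": np.array([[1.0]], dtype=np.float32),
        "temperature": np.array([[0.2]], dtype=np.float32),
    }
    inputs = []
    for name, arr in tensors.items():
        inp = grpcclient.InferInput(name, list(arr.shape), np_to_triton_dtype(arr.dtype))
        inp.set_data_from_numpy(arr)
        inputs.append(inp)

    result = client.infer("py-model", inputs)
    print(result.as_numpy("output_ids").shape)  # batch x beam_width x seq_len

In practice the CodeGenProxy shown earlier builds this request; the sketch only illustrates which tensors the config and backend expect.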

python_backend/init_model.py (new file, 44 lines)

@@ -0,0 +1,44 @@
"""
A simple script that sets up the model directory of a given model for Triton.
"""

import argparse
import os
import shutil
from pathlib import Path
from string import Template

SCRIPT_DIR = Path(__file__).parent
CONFIG_TEMPLATE_PATH = os.path.join(SCRIPT_DIR, 'config_template.pbtxt')

parser = argparse.ArgumentParser()
parser.add_argument("--model_dir", type=str, required=True)
parser.add_argument("--model_name", type=str, required=True)
parser.add_argument("--org_name", type=str, required=True)
parser.add_argument("--use_half", type=str, default="1")
parser.add_argument("--use_int8", type=str, default="0")
parser.add_argument("--use_auto_device_map", type=str, default="1")
args = parser.parse_args()


# Step1: Make model directory
model_dir_path = Path(os.path.join(Path(args.model_dir), f"py-{args.org_name}-{args.model_name}/py-model/1"))
model_dir_path.mkdir(parents=True, exist_ok=True)

# Step 2: copy model.py
shutil.copy(os.path.join(SCRIPT_DIR, 'model.py'), os.path.join(model_dir_path, 'model.py'))

# Step 3: Generate config.pbtxt
with open(CONFIG_TEMPLATE_PATH, 'r') as f:
    template = Template(f.read())

config = template.substitute(
    org_name=args.org_name,
    model_name=args.model_name,
    use_half=args.use_half,
    use_int8=args.use_int8,
    use_auto_device_map=args.use_auto_device_map,
)
with open(os.path.join(model_dir_path, '../config.pbtxt'), 'w') as f:
    f.write(config)
    print(f"Config written to {os.path.abspath(f.name)}")
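
The script lays files out in Triton's model-repository convention (one directory per model, a numbered version directory, config.pbtxt alongside it). For --org_name Salesforce --model_name codegen-350M-mono it produces, roughly:

    <model_dir>/
    └── py-Salesforce-codegen-350M-mono/
        └── py-model/
            ├── config.pbtxt        # rendered from config_template.pbtxt
            └── 1/
                └── model.py        # copied from python_backend/model.py

setup.sh (further down in this diff) invokes the script with the values collected interactively.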

python_backend/model.py (new file, 102 lines)

@@ -0,0 +1,102 @@
import json

import torch
import triton_python_backend_utils as pb_utils
# Using dlpack causes segfaults on some machines, so not using it for now
# But it supports zero copy transfer from triton tensors to torch tensors,
# so worth investigating further
# from torch.utils.dlpack import to_dlpack, from_dlpack
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer


def pb2torch(request, name):
    tensor = pb_utils.get_input_tensor_by_name(request, name)
    return torch.from_numpy(tensor.as_numpy())
    # return from_dlpack(tensor.to_dlpack())


def torch2pb(name, tensor):
    return pb_utils.Tensor(name, tensor.numpy())
    # return pb_utils.Tensor.from_dlpack(name, to_dlpack(tensor))


class TritonPythonModel:
    def initialize(self, args):
        self.model_config = model_config = json.loads(args["model_config"])
        org_name = model_config["parameters"].get("org_name", {"string_value": "Salesforce"})["string_value"]
        model_name = org_name + "/" + model_config["parameters"]["model_name"]["string_value"]

        def get_bool(x):
            return model_config["parameters"][x]["string_value"].lower() in ["1", "true"]

        is_half = get_bool("use_half") and torch.cuda.is_available()
        # This will make inference marginally slower, but will allow bigger models to fit in GPU
        int8 = get_bool("use_int8") and torch.cuda.is_available()
        auto_device_map = get_bool("use_auto_device_map") and torch.cuda.is_available()

        print("Cuda available?", torch.cuda.is_available())
        print(f"is_half: {is_half}, int8: {int8}, auto_device_map: {auto_device_map}")
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if is_half else ("auto" if torch.cuda.is_available() else torch.float32),
            load_in_8bit=int8,
            device_map="auto" if auto_device_map else None,
            low_cpu_mem_usage=True,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        print(f"Model {model_name} Loaded. Footprint: {self.model.get_memory_footprint()}")

        # set max_batch_size
        self.max_batch_size = 0  # model_config["max_batch_size"]

    def execute(self, requests):
        # TODO: don't just loop over requests. batch them up

        responses = []

        for request in requests:
            input_ids_torch = pb2torch(request, "input_ids")
            input_lengths_torch = pb2torch(request, "input_lengths")
            request_output_len_torch = pb2torch(request, "request_output_len")

            # Attention mask
            attention_mask = None
            if input_lengths_torch.min() != input_lengths_torch.max():
                attention_mask = torch.zeros(input_ids_torch.shape, dtype=torch.long)
                for i, l in enumerate(input_lengths_torch):
                    attention_mask[i, :l] = 1

            # Output length
            max_new_tokens = request_output_len_torch[0][0]

            top_k = pb_utils.get_input_tensor_by_name(request, "runtime_top_k").as_numpy().tolist()[0]
            top_p = pb_utils.get_input_tensor_by_name(request, "runtime_top_p").as_numpy().tolist()[0]
            temperature = pb_utils.get_input_tensor_by_name(request, "temperature").as_numpy().tolist()[0]
            # n_samples = pb_utils.get_input_tensor_by_name(request, "n")
            n_samples = 1  # TODO: client doesn't send this yet. instead it duplicates the request n times

            # Generate
            output_ids = self.model.generate(
                input_ids=input_ids_torch, attention_mask=attention_mask,
                max_new_tokens=max_new_tokens, do_sample=True, top_k=top_k, top_p=top_p, num_return_sequences=n_samples,
                temperature=temperature,
            )

            # client wants batch x beam_width x seq_len and we don't support beam_width yet
            output_ids = output_ids.unsqueeze(1)

            # create output tensors
            out_tensor_pb = torch2pb("output_ids", output_ids)

            # calculate sequence_length
            sequence_length = torch.zeros(output_ids.shape[:2], dtype=torch.int32)
            for i in range(output_ids.shape[0]):
                sequence_length[i, 0] = torch.sum(output_ids[i, 0] != self.model.config.eos_token_id).item()
            sequence_length_pb = torch2pb("sequence_length", sequence_length)

            # create response
            response = pb_utils.InferenceResponse([out_tensor_pb, sequence_length_pb])
            responses.append(response)

        return responses
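
Outside of Triton, the backend's initialize/execute pair boils down to a plain transformers generate() call. A minimal standalone sketch of the same generation path (assuming torch and transformers are installed and the checkpoint can be downloaded; the prompt and sampling values are illustrative):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_name = "Salesforce/codegen-350M-mono"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)

    # Same call execute() makes, minus the Triton tensor plumbing.
    input_ids = tokenizer("def hello_world():\n", return_tensors="pt").input_ids
    output_ids = model.generate(
        input_ids=input_ids, max_new_tokens=16,
        do_sample=True, top_k=1, top_p=1.0, temperature=0.2,
    )
    print(tokenizer.decode(output_ids[0][input_ids.shape[1]:]))

The dlpack comments above point at a possible zero-copy path between Triton and torch tensors; the committed code sticks with numpy round-trips because dlpack segfaulted on some machines.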

setup.sh (139 lines changed)

@@ -23,7 +23,40 @@ check_dep curl
check_dep zstd
check_dep docker

############### Common configuration ###############

# Read number of GPUs
read -rp "Enter number of GPUs [1]: " NUM_GPUS
NUM_GPUS=${NUM_GPUS:-1}

read -rp "External port for the API [5000]: " API_EXTERNAL_PORT
API_EXTERNAL_PORT=${API_EXTERNAL_PORT:-5000}

read -rp "Address for Triton [triton]: " TRITON_HOST
TRITON_HOST=${TRITON_HOST:-triton}

read -rp "Port of Triton host [8001]: " TRITON_PORT
TRITON_PORT=${TRITON_PORT:-8001}

# Read models root directory (all models go under this)
read -rp "Where do you want to save your models [$(pwd)/models]? " MODELS_ROOT_DIR
if [ -z "$MODELS_ROOT_DIR" ]; then
    MODELS_ROOT_DIR="$(pwd)/models"
else
    MODELS_ROOT_DIR="$(readlink -m "${MODELS_ROOT_DIR}")"
fi
mkdir -p "$MODELS_ROOT_DIR"

# Write .env
echo "NUM_GPUS=${NUM_GPUS}" >> .env
echo "GPUS=$(seq 0 $(( NUM_GPUS - 1)) | paste -s -d ',' -)" >> .env
echo "API_EXTERNAL_PORT=${API_EXTERNAL_PORT}" >> .env
echo "TRITON_HOST=${TRITON_HOST}" >> .env
echo "TRITON_PORT=${TRITON_PORT}" >> .env

############### Backend specific configuration ###############

function fastertransformer_backend(){
    echo "Models available:"
    echo "[1] codegen-350M-mono (2GB total VRAM required; Python-only)"
    echo "[2] codegen-350M-multi (2GB total VRAM required; multi-language)"

@@ -49,38 +82,11 @@ case $MODEL_NUM in
*) MODEL="codegen-6B-multi" ;;
esac

# Read number of GPUs
read -rp "Enter number of GPUs [1]: " NUM_GPUS
NUM_GPUS=${NUM_GPUS:-1}
echo "MODEL=${MODEL}" >> .env
echo "MODEL_DIR=${MODELS_ROOT_DIR}/${MODEL}-${NUM_GPUS}gpu" >> .env

read -rp "External port for the API [5000]: " API_EXTERNAL_PORT
API_EXTERNAL_PORT=${API_EXTERNAL_PORT:-5000}

read -rp "Address for Triton [triton]: " TRITON_HOST
TRITON_HOST=${TRITON_HOST:-triton}

read -rp "Port of Triton host [8001]: " TRITON_PORT
TRITON_PORT=${TRITON_PORT:-8001}

# Read model directory
read -rp "Where do you want to save the model [$(pwd)/models]? " MODEL_DIR
if [ -z "$MODEL_DIR" ]; then
    MODEL_DIR="$(pwd)/models"
else
    MODEL_DIR="$(readlink -m "${MODEL_DIR}")"
fi

# Write .env
echo "MODEL=${MODEL}" > .env
echo "NUM_GPUS=${NUM_GPUS}" >> .env
echo "MODEL_DIR=${MODEL_DIR}/${MODEL}-${NUM_GPUS}gpu" >> .env
echo "API_EXTERNAL_PORT=${API_EXTERNAL_PORT}" >> .env
echo "TRITON_HOST=${TRITON_HOST}" >> .env
echo "TRITON_PORT=${TRITON_PORT}" >> .env
echo "GPUS=$(seq 0 $(( NUM_GPUS - 1)) | paste -s -d ',' -)" >> .env

if (test -d "$MODEL_DIR"/"${MODEL}"-"${NUM_GPUS}"gpu ); then
    echo "$MODEL_DIR"/"${MODEL}"-"${NUM_GPUS}"gpu
if (test -d "$MODELS_ROOT_DIR"/"${MODEL}"-"${NUM_GPUS}"gpu ); then
    echo "$MODELS_ROOT_DIR"/"${MODEL}"-"${NUM_GPUS}"gpu
    echo "Converted model for ${MODEL}-${NUM_GPUS}gpu already exists."
    read -rp "Do you want to re-use it? y/n: " REUSE_CHOICE
    if [[ ${REUSE_CHOICE:-y} =~ ^[Yy]$ ]]

@@ -89,7 +95,7 @@ if (test -d "$MODEL_DIR"/"${MODEL}"-"${NUM_GPUS}"gpu ); then
        echo "Re-using model"
    else
        DOWNLOAD_MODEL=y
        rm -rf "$MODEL_DIR"/"${MODEL}"-"${NUM_GPUS}"gpu
        rm -rf "$MODELS_ROOT_DIR"/"${MODEL}"-"${NUM_GPUS}"gpu
    fi
else
    DOWNLOAD_MODEL=y

@@ -97,23 +103,80 @@ fi

if [[ ${DOWNLOAD_MODEL:-y} =~ ^[Yy]$ ]]
then
    # Create model directory
    mkdir -p "${MODEL_DIR}"
    if [ "$NUM_GPUS" -le 2 ]; then
        echo "Downloading the model from HuggingFace, this will take a while..."
        SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
        DEST="${MODEL}-${NUM_GPUS}gpu"
        ARCHIVE="${MODEL_DIR}/${DEST}.tar.zst"
        cp -r "$SCRIPT_DIR"/converter/models/"$DEST" "${MODEL_DIR}"
        ARCHIVE="${MODELS_ROOT_DIR}/${DEST}.tar.zst"
        cp -r "$SCRIPT_DIR"/converter/models/"$DEST" "${MODELS_ROOT_DIR}"
        curl -L "https://huggingface.co/moyix/${MODEL}-gptj/resolve/main/${MODEL}-${NUM_GPUS}gpu.tar.zst" \
            -o "$ARCHIVE"
        zstd -dc "$ARCHIVE" | tar -xf - -C "${MODEL_DIR}"
        zstd -dc "$ARCHIVE" | tar -xf - -C "${MODELS_ROOT_DIR}"
        rm -f "$ARCHIVE"
    else
        echo "Downloading and converting the model, this will take a while..."
        docker run --rm -v "${MODEL_DIR}":/models -e MODEL=${MODEL} -e NUM_GPUS="${NUM_GPUS}" moyix/model_converter:latest
        docker run --rm -v "${MODELS_ROOT_DIR}":/models -e MODEL=${MODEL} -e NUM_GPUS="${NUM_GPUS}" moyix/model_converter:latest
    fi
fi
}

function python_backend(){
    echo "Models available:"
    echo "[1] codegen-350M-mono (1GB total VRAM required; Python-only)"
    echo "[2] codegen-350M-multi (1GB total VRAM required; multi-language)"
    echo "[3] codegen-2B-mono (4GB total VRAM required; Python-only)"
    echo "[4] codegen-2B-multi (4GB total VRAM required; multi-language)"

    read -rp "Enter your choice [4]: " MODEL_NUM

    # Convert model number to model name
    case $MODEL_NUM in
        1) MODEL="codegen-350M-mono"; ORG="Salesforce" ;;
        2) MODEL="codegen-350M-multi"; ORG="Salesforce" ;;
        3) MODEL="codegen-2B-mono"; ORG="Salesforce" ;;
        4) MODEL="codegen-2B-multi"; ORG="Salesforce" ;;
        *) MODEL="codegen-2B-multi"; ORG="Salesforce" ;;
    esac

    # share huggingface cache? Should be safe to share, but permission issues may arise depending upon your docker setup
    read -rp "Do you want to share your huggingface cache between host and docker container? y/n [n]: " SHARE_HF_CACHE
    SHARE_HF_CACHE=${SHARE_HF_CACHE:-n}
    if [[ ${SHARE_HF_CACHE:-y} =~ ^[Yy]$ ]]; then
        read -rp "Enter your huggingface cache directory [$HOME/.cache/huggingface]: " HF_CACHE_DIR
        HF_CACHE_DIR=${HF_CACHE_DIR:-$HOME/.cache/huggingface}
    else
        HF_CACHE_DIR="$(pwd)/.hf_cache"
    fi

    # use int8? Allows larger models to fit in GPU but might be very marginally slower
    read -rp "Do you want to use int8? y/n [y]: " USE_INT8
    if [[ ${USE_INT8:-y} =~ ^[Nn]$ ]]; then
        USE_INT8="0"
    else
        USE_INT8="1"
    fi

    # Write config.env
    echo "MODEL=py-${MODEL}" >> .env
    echo "MODEL_DIR=${MODELS_ROOT_DIR}/py-${ORG}-${MODEL}" >> .env # different format from fastertransformer backend
    echo "HF_CACHE_DIR=${HF_CACHE_DIR}" >> .env

    python3 ./python_backend/init_model.py --model_name "${MODEL}" --org_name "${ORG}" --model_dir "${MODELS_ROOT_DIR}" --use_int8 "${USE_INT8}"
}

common_config

# choose backend
echo "Choose your backend:"
echo "[1] FasterTransformer backend (faster, but limited models)"
echo "[2] Python backend (slower, but more models, and allows loading with int8)"
read -rp "Enter your choice [1]: " BACKEND_NUM

if [ "$BACKEND_NUM" -eq 2 ]; then
    python_backend
else
    fastertransformer_backend
fi

read -rp "Config complete, do you want to run FauxPilot? [y/n] " RUN
if [[ ${RUN:-y} =~ ^[Yy]$ ]]
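
For the Python backend path with codegen-350M-mono and the defaults above, the .env written by the common configuration block plus python_backend ends up looking roughly like this (paths are examples and depend on the answers given):

    NUM_GPUS=1
    GPUS=0
    API_EXTERNAL_PORT=5000
    TRITON_HOST=triton
    TRITON_PORT=8001
    MODEL=py-codegen-350M-mono
    MODEL_DIR=/home/user/models/py-Salesforce-codegen-350M-mono
    HF_CACHE_DIR=/home/user/.cache/huggingface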

tests/python_backend/.gitignore (new file, 2 lines)

@@ -0,0 +1,2 @@
test.env
models/*

tests/python_backend/docker-compose-with-gpus.yaml (new file, 35 lines)

@@ -0,0 +1,35 @@
version: '3.3'
services:
  triton:
    build:
      context: ../../
      dockerfile: Dockerfile
    command: bash -c "CUDA_VISIBLE_DEVICES="${GPUS}" mpirun -n 1 --allow-run-as-root /opt/tritonserver/bin/tritonserver --model-repository=/model"
    shm_size: '2gb'
    volumes:
      - ${MODEL_DIR}:/model
      - ${HF_CACHE_DIR}:/root/.cache/huggingface
    ports:
      - "8000:8000"
      - "8001:8001"
      - "8002:8002"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
  copilot_proxy:
    # For dockerhub version
    # image: moyix/copilot_proxy:latest
    # command: python3 -m flask run --host=0.0.0.0 --port=5000
    # For local build
    build:
      context: ../../
      dockerfile: copilot_proxy/Dockerfile
    env_file:
      # Automatically created via ./setup.sh
      - test.env
    ports:
      - "${API_EXTERNAL_PORT}:5000"

tests/python_backend/docker-compose-without-gpus.yaml (new file, 28 lines)

@@ -0,0 +1,28 @@
version: '3.3'
services:
  triton:
    build:
      context: ../../
      dockerfile: Dockerfile
    command: bash -c "CUDA_VISIBLE_DEVICES="${GPUS}" mpirun -n 1 --allow-run-as-root /opt/tritonserver/bin/tritonserver --model-repository=/model"
    shm_size: '2gb'
    volumes:
      - ${MODEL_DIR}:/model
      - ${HF_CACHE_DIR}:/root/.cache/huggingface
    ports:
      - "8000:8000"
      - "8001:8001"
      - "8002:8002"
  copilot_proxy:
    # For dockerhub version
    # image: moyix/copilot_proxy:latest
    # command: python3 -m flask run --host=0.0.0.0 --port=5000
    # For local build
    build:
      context: ../../
      dockerfile: copilot_proxy/Dockerfile
    env_file:
      # Automatically created via ./setup.sh
      - test.env
    ports:
      - "${API_EXTERNAL_PORT}:5000"

tests/python_backend/test_setup.py (new file, 163 lines)

@@ -0,0 +1,163 @@
"Tests setup script (currently for Python backend)"

import os
import subprocess
import signal
import shutil
from pathlib import Path
from typing import Dict, Union

import pexpect
import pytest
import requests

curdir = Path(__file__).parent
root = curdir.parent.parent

test_models_dir = curdir/"models"


def setup_module():
    "Setup steps for tests in this module"
    assert (root/"setup.sh").exists(), "setup.sh not found"
    if (root/".env").exists():
        shutil.move(str(root/".env"), str(root/".env.bak"))


def teardown_module():
    "Teardown steps for tests in this module"
    if (root/".env.bak").exists():
        shutil.move(str(root/".env.bak"), str(root/".env"))
    try:
        if test_models_dir.exists():
            shutil.rmtree(test_models_dir)
    except Exception as exc:
        print(
            f"WARNING: Couldn't delete `{test_models_dir}` most likely due to permission issues."
            f"Run the tests with sudo to ensure this gets deleted automatically, or else delete manually. Exception: {exc}"
        )


def enter_input(proc: pexpect.spawn, expect: str, input_s: str, timeout: int = 5) -> str:
    "Helper function to enter input for a given prompt. Returns consumed output."

    try:
        proc.expect(expect, timeout=timeout)
    except pexpect.exceptions.TIMEOUT as exc:
        raise AssertionError(
            f"Timeout waiting for prompt: `{expect}`.\n"
            f"Output-before: `{proc.before}`\nOutput-after: `{proc.after}`"
        ) from exc

    after = str(proc.after)
    print(after)
    proc.sendline(input_s)
    return after


def run_common_setup_steps(n_gpus: int = 0) -> pexpect.spawn:
    "Helper function to run common setup steps."
    proc = pexpect.pty_spawn.spawn(
        "./setup.sh 2>&1", encoding="utf-8", cwd=str(root),
    )
    proc.ignorecase = True

    enter_input(proc, r".*Enter number of GPUs[^:]+: ?", str(n_gpus))
    enter_input(proc, r".*port for the API[^:]+: ?", "5000")
    enter_input(proc, r".*Address for Triton[^:]+: ?", "triton")
    enter_input(proc, r".*Port of Triton[^:]+: ?", "8001")
    enter_input(proc, r".*save your models[^\?]+\? ?", str(test_models_dir.absolute()))

    return proc


def load_test_env():
    "Load test env vars"
    # Without loading default env vars, PATH won't be set correctly
    env = os.environ.copy()
    with open(curdir/"test.env", "r", encoding="utf8") as test_env:
        for line in test_env:
            key, val = line.strip().split("=")
            env[key] = val
    return env


def run_inference(
        prompt: str, model_name: str = "py-model", port: int = 5000, return_all: bool = False,
        **kwargs
) -> Union[str, Dict]:
    "Invokes the copilot proxy with the given prompt and returns the completion"
    endpoint = f"http://localhost:{port}/v1/engines/codegen/completions"
    data = {
        "model": model_name,
        "prompt": prompt,
        "suffix": kwargs.get("suffix", ""),
        "max_tokens": kwargs.get("max_tokens", 16),
        "temperature": kwargs.get("temperature", 0.0),
        "top_p": kwargs.get("top_p", 1.0),
        "n": kwargs.get("n", 1),
        "stream": kwargs.get("stream", None),  # it's not true/false. It's None or not None :[
        "logprobs": kwargs.get("logprobs", 0),
        "stop": kwargs.get("stop", ""),
        "echo": kwargs.get("echo", True),
        "presence_penalty": kwargs.get("presence_penalty", 0.0),
        "frequency_penalty": kwargs.get("frequency_penalty", 0.0),
        "best_of": kwargs.get("best_of", 1),
        "logit_bias": kwargs.get("logit_bias", {}),
        "user": kwargs.get("user", "test"),
    }

    response = requests.post(endpoint, json=data)
    response.raise_for_status()

    if return_all:
        return response.json()
    return response.json()["choices"][0]["text"]


@pytest.mark.parametrize("n_gpus", [0])  # we don't have a GPU on CI
def test_python_backend(n_gpus: int):
    """
    Step 1: run $root/setup.sh while passing appropriate options via stdin
    Step 2: run docker-compose up with test.env sourced
    Step 3: call :5000 with appropriate request
    """
    proc = run_common_setup_steps(n_gpus)

    choices = enter_input(proc, r".*Choose your backend.*Enter your choice[^:]+: ?", "2")
    assert "[2] Python backend" in choices, "Option 2 should be Python backend"

    choices = enter_input(proc, r".*Models available:.*Enter your choice[^:]+: ?", "1")
    assert "[1] codegen-350M-mono" in choices, "Option 1 should be codegen-350M-mono"

    enter_input(proc, r".*share (your )?huggingface cache[^:]+: ?", "y")
    enter_input(proc, r".*cache directory[^:]+: ?", "")  # default
    enter_input(proc, r".*use int8[^:]+: ?", "n")
    enter_input(proc, r".*run FauxPilot\? \[y/n\] ", "n")

    # copy $root/.env to $curdir/test.env
    shutil.copy(str(root/".env"), str(curdir/"test.env"))

    # run docker-compose up -f docker-compose-{without|with}-gpus.yml
    compose_file = f"docker-compose-with{'' if n_gpus > 0 else 'out'}-gpus.yaml"
    docker_proc = None
    try:
        docker_proc = pexpect.pty_spawn.spawn(
            f"docker-compose -f {compose_file} up",
            encoding="utf-8",
            cwd=curdir,
            env=load_test_env(),
        )

        print("Waiting for API to be ready...")
        docker_proc.expect(r".*Started GRPCInferenceService at 0.0.0.0:8001", timeout=120)

        print("API ready, sending request...")

        # Simple test 1: hello world prompt without bells and whistles
        response = run_inference("def hello_world():\n", max_tokens=16, return_all=True)
        assert response["choices"][0]["text"].rstrip() == ' print("Hello World")\n\nhello_world()\n\n#'
        assert response["choices"][0]["finish_reason"] == "length"

    finally:
        if docker_proc is not None and docker_proc.isalive():
            docker_proc.kill(signal.SIGINT)

        # killing docker-compose process doesn't bring down the containers.
        # explicitly stop the containers:
        subprocess.run(["docker-compose", "-f", compose_file, "down"], cwd=curdir, check=True, env=load_test_env())
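
Once the stack brought up by this test (or by setup.sh plus docker-compose) is listening, the same endpoint can be exercised by hand. A minimal sketch, assuming the proxy is on localhost:5000 as configured above:

    import requests

    resp = requests.post(
        "http://localhost:5000/v1/engines/codegen/completions",
        json={"model": "py-model", "prompt": "def hello_world():\n",
              "max_tokens": 16, "temperature": 0.0},
    )
    resp.raise_for_status()
    print(resp.json()["choices"][0]["text"])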