Mirror of https://github.com/fauxpilot/fauxpilot.git
Add python backend support
- Modify Dockerfile to include bitsandbytes, transformers and the latest version of PyTorch
- Minor modifications in utils/codegen.py so that the same client works with both the FT and Python backends
- Minor modifications in launch.sh (no need to name models by GPU)
- Add an installation script for adding a new Python model (with a super simple config_template)
- Modify setup.sh so that it works with both FT and Python backend models

Signed-off-by: Parth Thakkar <thakkarparth007@gmail.com>
This commit is contained in:
parent 9b2bc84670
commit 01f1cbb629
9 changed files with 487 additions and 73 deletions
Dockerfile (new file, 5 lines)

@@ -0,0 +1,5 @@
FROM moyix/triton_with_ft:22.09

# Install dependencies: torch
RUN pip3 install -U torch --extra-index-url https://download.pytorch.org/whl/cu116
RUN pip3 install -U transformers bitsandbytes accelerate
@@ -4,7 +4,7 @@ from pydantic import BaseModel


 class OpenAIinput(BaseModel):
-    model: str
+    model: str = "fastertransformer|py-model"
     prompt: Optional[str]
     suffix: Optional[str]
     max_tokens: Optional[int] = 16
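With this default in place, the backend that serves a completion is chosen purely by the request's model field. A hypothetical client call is sketched below; the proxy's host, port, and route are assumptions and are not part of this diff.

import requests  # illustrative client sketch, not part of this commit

resp = requests.post(
    "http://localhost:5000/v1/engines/codegen/completions",  # assumed FauxPilot proxy endpoint
    json={
        "model": "py-model",        # or "fastertransformer" for the FT backend
        "prompt": "def fibonacci(n):",
        "max_tokens": 16,
        "temperature": 0.2,
    },
)
print(resp.json())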
utils/codegen.py

@@ -70,17 +70,21 @@ class CodeGenProxy:
         return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2))

     def generate(self, data):
-        model_name = "fastertransformer"
         prompt = data['prompt']
         n = data.get('n', 1)
+        model_name = data["model"]
+        # ugly hack to set the data type correctly. Huggingface models want int32, but fastertransformer needs uint32
+        # i could've done the conversion from uint32 to int32 in the model but that'd be inefficient.
+        np_type = np.int32 if model_name.startswith("py-") else np.uint32
+
         input_start_ids = np.expand_dims(self.tokenizer.encode(prompt).ids, 0)
-        input_start_ids = np.repeat(input_start_ids, n, axis=0).astype(np.uint32)
+        input_start_ids = np.repeat(input_start_ids, n, axis=0).astype(np_type)
         prompt_len = input_start_ids.shape[1]
-        input_len = prompt_len * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
+        input_len = prompt_len * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
         max_tokens = data.get('max_tokens', 16)
         if max_tokens + input_len[0][0] > self.MAX_MODEL_LEN:
             raise ValueError("Max tokens + prompt length exceeds maximum model length")
-        output_len = np.ones_like(input_len).astype(np.uint32) * max_tokens
+        output_len = np.ones_like(input_len).astype(np_type) * max_tokens
         num_logprobs = data.get('logprobs', -1)
         if num_logprobs is None:
             num_logprobs = 1
@@ -95,7 +99,7 @@ class CodeGenProxy:

         top_p = data.get('top_p', 1.0)
         frequency_penalty = data.get('frequency_penalty', 1.0)
-        runtime_top_k = top_k * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
+        runtime_top_k = top_k * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
         runtime_top_p = top_p * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
         beam_search_diversity_rate = 0.0 * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
         random_seed = np.random.randint(0, 2 ** 31 - 1, (input_start_ids.shape[0], 1), dtype=np.int32)
@@ -103,9 +107,9 @@ class CodeGenProxy:
         len_penalty = 1.0 * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
         repetition_penalty = frequency_penalty * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
         is_return_log_probs = want_logprobs * np.ones([input_start_ids.shape[0], 1]).astype(np.bool_)
-        beam_width = (1 * np.ones([input_start_ids.shape[0], 1])).astype(np.uint32)
-        start_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
-        end_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
+        beam_width = (1 * np.ones([input_start_ids.shape[0], 1])).astype(np_type)
+        start_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
+        end_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np_type)

         stop_words = data.get('stop', [])
         if stop_words is None:
@@ -1,11 +1,14 @@
 version: '3.3'
 services:
   triton:
-    image: moyix/triton_with_ft:22.09
+    build:
+      context: .
+      dockerfile: Dockerfile
     command: bash -c "CUDA_VISIBLE_DEVICES=${GPUS} mpirun -n 1 --allow-run-as-root /opt/tritonserver/bin/tritonserver --model-repository=/model"
     shm_size: '2gb'
     volumes:
       - ${MODEL_DIR}:/model
+      - ${HF_CACHE_DIR}:/root/.cache/huggingface
     ports:
       - "8000:8000"
       - "8001:8001"
launch.sh

@@ -8,8 +8,16 @@ fi
 source config.env

 export NUM_GPUS=${NUM_GPUS}
+
+# if model name starts with "py-", it means we're dealing with the python backend.
+if [[ $(echo "$MODEL" | cut -c1-3) == "py-" ]]; then
+  export MODEL_DIR="${MODEL_DIR}"/"${MODEL}" #/py_model"
+else
 export MODEL_DIR="${MODEL_DIR}"/"${MODEL}-${NUM_GPUS}gpu"
+fi
+
 export GPUS=$(seq 0 $(( NUM_GPUS - 1 )) | paste -sd ',')
+export HF_CACHE_DIR=${HF_CACHE_DIR}

 # On newer versions, docker-compose is docker compose
 if command -v docker-compose > /dev/null; then
python_backend/config_template.pbtxt (new file, 180 lines)

@@ -0,0 +1,180 @@
name: "py-model"
backend: "python"
max_batch_size: 4
input [
  {
    name: "input_ids"
    data_type: TYPE_INT32
    dims: [ -1 ]
  },
  {
    # UNUSED
    name: "start_id"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    # UNUSED
    name: "end_id"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "input_lengths"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
  },
  {
    name: "request_output_len"
    data_type: TYPE_INT32
    dims: [ -1 ]
  },
  {
    name: "runtime_top_k"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "runtime_top_p"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    # UNUSED
    name: "beam_search_diversity_rate"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "temperature"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    # UNUSED
    name: "len_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    # UNUSED
    name: "repetition_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    # UNUSED
    name: "random_seed"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    # UNUSED
    name: "is_return_log_probs"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    # UNUSED
    name: "beam_width"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    # UNUSED
    name: "bad_words_list"
    data_type: TYPE_INT32
    dims: [ 2, -1 ]
    optional: true
  },
  {
    # UNUSED
    name: "stop_words_list"
    data_type: TYPE_INT32
    dims: [ 2, -1 ]
    optional: true
  }
]
output [
  {
    name: "output_ids"
    data_type: TYPE_INT32
    dims: [ -1, -1 ]
  },
  {
    name: "sequence_length"
    data_type: TYPE_INT32
    dims: [ -1 ]
  } #,
  # Following is currently unsupported, but should be supported in the future
  # {
  #   name: "cum_log_probs"
  #   data_type: TYPE_FP32
  #   dims: [ -1 ]
  # },
  # {
  #   name: "output_log_probs"
  #   data_type: TYPE_FP32
  #   dims: [ -1, -1 ]
  # }
]
# unsure what this is for
instance_group [
  {
    count: 1
    kind: KIND_CPU
  }
]
parameters {
  key: "use_half"
  value: {
    string_value: "1"
  }
}
parameters {
  key: "model_name"
  value: {
    string_value: "${model_name}" # e.g. "codegen-350M-multi"
  }
}
parameters {
  key: "org_name"
  value: {
    string_value: "${org_name}" # e.g. "Salesforce"
  }
}
parameters {
  key: "use_int8",
  value: {
    string_value: "${use_int8}" # e.g. "0" or "1"
  }
}
parameters {
  key: "use_auto_device_map",
  value: {
    string_value: "${use_auto_device_map}" # e.g. "0" or "1"
  }
}
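For orientation: once init_model.py (below) has filled in the ${...} placeholders, Triton hands this config to the backend as JSON, and model.py reads each parameter out of a nested "string_value" dict. A minimal sketch of that structure, with illustrative values that are not taken from this diff:

import json

# Illustrative shape of the filled-in "parameters" block as model.py's initialize() sees it.
model_config = json.loads("""
{
  "parameters": {
    "use_half":            {"string_value": "1"},
    "model_name":          {"string_value": "codegen-350M-mono"},
    "org_name":            {"string_value": "Salesforce"},
    "use_int8":            {"string_value": "1"},
    "use_auto_device_map": {"string_value": "1"}
  }
}
""")

checkpoint = model_config["parameters"]["org_name"]["string_value"] + "/" + \
             model_config["parameters"]["model_name"]["string_value"]
print(checkpoint)  # Salesforce/codegen-350M-mono -- what AutoModelForCausalLM.from_pretrained() loads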
python_backend/init_model.py (new file, 42 lines)

@@ -0,0 +1,42 @@
"""
A simple script that sets up the model directory of a given model for Triton.
"""

import argparse
import shutil
from pathlib import Path
from string import Template

SCRIPT_DIR = Path(__file__).parent
CONFIG_TEMPLATE_PATH = SCRIPT_DIR/'config_template.pbtxt'

parser = argparse.ArgumentParser()
parser.add_argument("--model_dir", type=str, required=True)
parser.add_argument("--model_name", type=str, required=True)
parser.add_argument("--org_name", type=str, required=True)
parser.add_argument("--use_half", type=str, default="1")
parser.add_argument("--use_int8", type=str, default="0")
parser.add_argument("--use_auto_device_map", type=str, default="1")
args = parser.parse_args()


# Step1: Make model directory
model_dir_path = Path(args.model_dir)/f"py-{args.model_name}/py-model/1"
model_dir_path.mkdir(parents=True, exist_ok=True)

# Step 2: copy model.py
shutil.copy(SCRIPT_DIR/'model.py', model_dir_path/'model.py')

# Step 3: Generate config.pbtxt
with open(CONFIG_TEMPLATE_PATH, 'r') as f:
    template = Template(f.read())

config = template.substitute(
    org_name=args.org_name,
    model_name=args.model_name,
    use_half=args.use_half,
    use_int8=args.use_int8,
    use_auto_device_map=args.use_auto_device_map,
)

with open(model_dir_path/'../config.pbtxt', 'w') as f:
    f.write(config)
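For reference, a small sketch of the on-disk layout the script produces, which is the layout Triton's --model-repository flag expects; the model name and directory below are illustrative, and setup.sh (further down) drives the real invocation:

from pathlib import Path

# Mirrors init_model.py's path arithmetic for a hypothetical run:
#   python3 python_backend/init_model.py --model_dir ./models \
#       --model_name codegen-350M-mono --org_name Salesforce
model_dir, model_name = Path("./models"), "codegen-350M-mono"

version_dir = model_dir / f"py-{model_name}" / "py-model" / "1"
print(version_dir)                           # models/py-codegen-350M-mono/py-model/1 (holds model.py)
print(version_dir.parent / "config.pbtxt")   # models/py-codegen-350M-mono/py-model/config.pbtxt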
python_backend/model.py (new file, 91 lines)

@@ -0,0 +1,91 @@
import json

from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
import triton_python_backend_utils as pb_utils
from torch.utils.dlpack import to_dlpack, from_dlpack
import torch


def pb2torch(request, name):
    tensor = pb_utils.get_input_tensor_by_name(request, name)
    return from_dlpack(tensor.to_dlpack())


def torch2pb(name, tensor):
    return pb_utils.Tensor.from_dlpack(name, to_dlpack(tensor))


class TritonPythonModel:
    def initialize(self, args):
        self.model_config = model_config = json.loads(args["model_config"])
        org_name = model_config["parameters"].get("org_name", {"string_value": "Salesforce"})["string_value"]
        model_name = org_name + "/" + model_config["parameters"]["model_name"]["string_value"]

        get_bool = lambda x: model_config["parameters"][x]["string_value"].lower() in ["1", "true"]

        is_half = get_bool("use_half")
        int8 = get_bool("use_int8")  # this will make inference marginally slower, but will allow bigger models to fit in GPU
        auto_device_map = get_bool("use_auto_device_map")

        print(f"is_half: {is_half}, int8: {int8}, auto_device_map: {auto_device_map}")
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if is_half else "auto",
            load_in_8bit=int8,
            device_map="auto" if auto_device_map else None,
            low_cpu_mem_usage=True,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        print(f"Model {model_name} Loaded. Footprint: {self.model.get_memory_footprint()}")

        # set max_batch_size
        self.max_batch_size = 0  # model_config["max_batch_size"]

    def execute(self, requests):
        # TODO: don't just loop over requests. batch them up

        responses = []

        for request in requests:
            input_ids_torch = pb2torch(request, "input_ids")
            input_lengths_torch = pb2torch(request, "input_lengths")
            request_output_len_torch = pb2torch(request, "request_output_len")

            # Attention mask
            attention_mask = None
            if input_lengths_torch.min() != input_lengths_torch.max():
                attention_mask = torch.zeros(input_ids_torch.shape, dtype=torch.long)
                for i, l in enumerate(input_lengths_torch):
                    attention_mask[i, :l] = 1

            # Output length
            max_new_tokens = request_output_len_torch[0][0]

            top_k = pb_utils.get_input_tensor_by_name(request, "runtime_top_k").as_numpy().tolist()[0]
            top_p = pb_utils.get_input_tensor_by_name(request, "runtime_top_p").as_numpy().tolist()[0]
            temperature = pb_utils.get_input_tensor_by_name(request, "temperature").as_numpy().tolist()[0]
            # n_samples = pb_utils.get_input_tensor_by_name(request, "n")
            n_samples = 1  # TODO: client doesn't send this yet. instead it duplicates the request n times

            # Generate
            output_ids = self.model.generate(
                input_ids=input_ids_torch, attention_mask=attention_mask,
                max_new_tokens=max_new_tokens, do_sample=True, top_k=top_k, top_p=top_p, num_return_sequences=n_samples,
                temperature=temperature,
            )

            # assert len(output_ids.shape) == 2, "huggingface format is batch x seq_len"
            # assert output_ids.shape[0] == input_ids_torch.shape[0], "expecting batch size to match input"
            output_ids = output_ids.unsqueeze(1)  # client wants batch x beam_width x seq_len and we don't support beam_width yet

            # create output tensors
            out_tensor_pb = torch2pb("output_ids", output_ids)

            # calculate sequence_length
            sequence_length = torch.zeros(output_ids.shape[:2], dtype=torch.int32)
            for i in range(output_ids.shape[0]):
                sequence_length[i, 0] = torch.sum(output_ids[i, 0] != self.model.config.eos_token_id).item()
            sequence_length_pb = torch2pb("sequence_length", sequence_length)

            # create response
            response = pb_utils.InferenceResponse([out_tensor_pb, sequence_length_pb])
            responses.append(response)

        return responses
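Not part of this commit, but for orientation: since the py-model config exposes plain Triton tensors, it can also be poked directly over the gRPC port that docker-compose maps (8001). A rough client sketch follows, with made-up token ids standing in for real tokenizer output; in practice the request path goes through CodeGenProxy in utils/codegen.py.

import numpy as np
import tritonclient.grpc as client_util
from tritonclient.utils import np_to_triton_dtype

def prepare_tensor(name, arr):
    # Wrap a numpy array as a Triton input tensor.
    t = client_util.InferInput(name, arr.shape, np_to_triton_dtype(arr.dtype))
    t.set_data_from_numpy(arr)
    return t

client = client_util.InferenceServerClient(url="localhost:8001")  # gRPC port from docker-compose

input_ids = np.array([[1212, 318, 257, 1332]], dtype=np.int32)  # batch x seq_len, placeholder ids
inputs = [
    prepare_tensor("input_ids", input_ids),
    prepare_tensor("input_lengths", np.array([[input_ids.shape[1]]], dtype=np.int32)),
    prepare_tensor("request_output_len", np.array([[16]], dtype=np.int32)),
    prepare_tensor("runtime_top_k", np.array([[50]], dtype=np.int32)),
    prepare_tensor("runtime_top_p", np.array([[0.95]], dtype=np.float32)),
    prepare_tensor("temperature", np.array([[0.2]], dtype=np.float32)),
]

result = client.infer("py-model", inputs)
output_ids = result.as_numpy("output_ids")  # batch x beam_width x seq_len, per model.py above
print(output_ids.shape)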
setup.sh (81 lines changed)

@@ -18,6 +18,7 @@ check_dep zstd
 check_dep docker


+function fastertransformer_backend(){
 echo "Models available:"
 echo "[1] codegen-350M-mono (2GB total VRAM required; Python-only)"
 echo "[2] codegen-350M-multi (2GB total VRAM required; multi-language)"
@@ -87,3 +88,83 @@ else
 docker run --rm -v ${MODEL_DIR}:/models -e MODEL=${MODEL} -e NUM_GPUS=${NUM_GPUS} moyix/model_converter:latest
 fi
 echo "Done! Now run ./launch.sh to start the FauxPilot server."
+}
+
+function python_backend(){
+    echo "Models available:"
+    echo "[1] codegen-350M-mono (1GB total VRAM required; Python-only)"
+    echo "[2] codegen-350M-multi (1GB total VRAM required; multi-language)"
+    echo "[3] codegen-2B-mono (4GB total VRAM required; Python-only)"
+    echo "[4] codegen-2B-multi (4GB total VRAM required; multi-language)"
+    # echo "[5] codegen-6B-mono (13GB total VRAM required; Python-only)"
+    # echo "[6] codegen-6B-multi (13GB total VRAM required; multi-language)"
+    # echo "[7] codegen-16B-mono (32GB total VRAM required; Python-only)"
+    # echo "[8] codegen-16B-multi (32GB total VRAM required; multi-language)"
+
+    # Read their choice
+    read -p "Enter your choice [4]: " MODEL_NUM
+
+    # Convert model number to model name
+    case $MODEL_NUM in
+        1) MODEL="codegen-350M-mono"; ORG="Salesforce" ;;
+        2) MODEL="codegen-350M-multi"; ORG="Salesforce" ;;
+        3) MODEL="codegen-2B-mono"; ORG="Salesforce" ;;
+        4) MODEL="codegen-2B-multi"; ORG="Salesforce" ;;
+    esac
+
+    # Read number of GPUs -- not strictly required for python backend, because of device_map="auto",
+    # but docker-compose.py uses it to select CUDA_VISIBLE_DEVICES
+    read -p "Enter number of GPUs [1]: " NUM_GPUS
+    NUM_GPUS=${NUM_GPUS:-1}
+
+    # Read model directory
+    read -p "Where do you want to save the model [$(pwd)/models]? " MODEL_DIR
+    MODEL_DIR=${MODEL_DIR:-$(pwd)/models}
+    if [ -z "$MODEL_DIR" ]; then
+        MODEL_DIR="$(pwd)/models"
+    else
+        MODEL_DIR="$(readlink -m "${MODEL_DIR}")"
+    fi
+
+    # share huggingface cache? Should be safe to share, but permission issues may arise depending upon your docker setup
+    read -p "Do you want to share your huggingface cache between host and docker container? y/n [n]: " SHARE_HF_CACHE
+    SHARE_HF_CACHE=${SHARE_HF_CACHE:-n}
+    if [ "${SHARE_HF_CACHE^^}" = "Y" ]; then
+        read -p "Enter your huggingface cache directory [$HOME/.cache/huggingface]: " HF_CACHE_DIR
+        HF_CACHE_DIR=${HF_CACHE_DIR:-$HOME/.cache/huggingface}
+    else
+        HF_CACHE_DIR="/tmp/hf_cache"
+    fi
+
+    # use int8? Allows larger models to fit in GPU but might be very marginally slower
+    read -p "Do you want to use int8? y/n [y]: " USE_INT8
+    USE_INT8=${USE_INT8:-y}
+    if [ "${USE_INT8^^}" = "N" ]; then
+        USE_INT8="0"
+    else
+        USE_INT8="1"
+    fi
+
+    # Write config.env
+    echo "MODEL=py-${MODEL}" > config.env
+    echo "NUM_GPUS=${NUM_GPUS}" >> config.env
+    echo "MODEL_DIR=${MODEL_DIR}" >> config.env
+    echo "HF_CACHE_DIR=${HF_CACHE_DIR}" >> config.env
+
+    # Create model directory
+    mkdir -p "${MODEL_DIR}/"
+    python3 ./python_backend/init_model.py --model_name "${MODEL}" --org_name "${ORG}" --model_dir "${MODEL_DIR}" --use_int8 "${USE_INT8}"
+
+    echo "Done! Now run ./launch.sh to start the FauxPilot server."
+}
+
+# choose backend
+echo "Choose your backend:"
+echo "[1] FasterTransformer backend (faster, but limited models)"
+echo "[2] Python backend (slower, but more models, and allows loading with int8)"
+read -p "Enter your choice [1]: " BACKEND_NUM
+
+if [ $BACKEND_NUM -eq 2 ]; then
+    python_backend
+else
+    fastertransformer_backend
+fi