Add python backend support

- Modify dockerfile to include bitsandbytes, transformers and latest version of pytorch
- Minor modifications in utils/codegen.py so that same client works with FT and Py-backend
- Minor modifications in launch.sh (no need to name models by GPU)
- Add installation script for adding a new python model (with super simple config_template)
- Modify setup.sh so that it works with both FT and Python backend models

Signed-off-by: Parth Thakkar <thakkarparth007@gmail.com>
This commit is contained in:
Parth Thakkar 2022-10-16 22:05:00 -05:00
parent 9b2bc84670
commit 01f1cbb629
9 changed files with 487 additions and 73 deletions

View file

@ -0,0 +1,180 @@
name: "py-model"
backend: "python"
max_batch_size: 4
input [
{
name: "input_ids"
data_type: TYPE_INT32
dims: [ -1 ]
},
{
# UNUSED
name: "start_id"
data_type: TYPE_INT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
# UNUSED
name: "end_id"
data_type: TYPE_INT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "input_lengths"
data_type: TYPE_INT32
dims: [ 1 ]
reshape: { shape: [ ] }
},
{
name: "request_output_len"
data_type: TYPE_INT32
dims: [ -1 ]
},
{
name: "runtime_top_k"
data_type: TYPE_INT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "runtime_top_p"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
# UNUSED
name: "beam_search_diversity_rate"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
name: "temperature"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
# UNUSED
name: "len_penalty"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
# UNUSED
name: "repetition_penalty"
data_type: TYPE_FP32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
# UNUSED
name: "random_seed"
data_type: TYPE_INT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
# UNUSED
name: "is_return_log_probs"
data_type: TYPE_BOOL
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
# UNUSED
name: "beam_width"
data_type: TYPE_INT32
dims: [ 1 ]
reshape: { shape: [ ] }
optional: true
},
{
# UNUSED
name: "bad_words_list"
data_type: TYPE_INT32
dims: [ 2, -1 ]
optional: true
},
{
# UNUSED
name: "stop_words_list"
data_type: TYPE_INT32
dims: [ 2, -1 ]
optional: true
}
]
output [
{
name: "output_ids"
data_type: TYPE_INT32
dims: [ -1, -1 ]
},
{
name: "sequence_length"
data_type: TYPE_INT32
dims: [ -1 ]
} #,
# Following is currently unsupported, but should be supported in the future
# {
# name: "cum_log_probs"
# data_type: TYPE_FP32
# dims: [ -1 ]
# },
# {
# name: "output_log_probs"
# data_type: TYPE_FP32
# dims: [ -1, -1 ]
# }
]
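# instance_group tells Triton how many copies of this model to run and on what
# kind of device. A single KIND_CPU instance is requested here; device placement
# of the weights appears to be left to model.py (NOTE(review): confirm).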
instance_group [
{
count: 1
kind: KIND_CPU
}
]
# Whether the model should be loaded in half precision. The value is filled in
# by the setup script's --use_half option (it was previously hard-coded to "1",
# so that option had no effect).
parameters {
  key: "use_half"
  value: {
    string_value: "${use_half}" # e.g. "0" or "1"
  }
}
parameters {
key: "model_name"
value: {
string_value: "${model_name}" # e.g. "codegen-350M-multi"
}
}
parameters {
key: "org_name"
value: {
string_value: "${org_name}" # e.g. "Salesforce"
}
}
parameters {
key: "use_int8",
value: {
string_value: "${use_int8}" # e.g. "0" or "1"
}
}
parameters {
key: "use_auto_device_map",
value: {
string_value: "${use_auto_device_map}" # e.g. "0" or "1"
}
}

View file

@ -0,0 +1,42 @@
"""
A simple script that sets up the model directory of a given model for Triton.
"""
import argparse
import shutil
from pathlib import Path
from string import Template
# The config template and model.py are expected to sit next to this script.
SCRIPT_DIR = Path(__file__).parent
CONFIG_TEMPLATE_PATH = SCRIPT_DIR/'config_template.pbtxt'

# Command-line interface: three mandatory string options followed by three
# string flags that carry a default value.
parser = argparse.ArgumentParser()
for flag in ("--model_dir", "--model_name", "--org_name"):
    parser.add_argument(flag, type=str, required=True)
for flag, default in (
    ("--use_half", "1"),
    ("--use_int8", "0"),
    ("--use_auto_device_map", "1"),
):
    parser.add_argument(flag, type=str, default=default)
args = parser.parse_args()

# Step 1: create <model_dir>/py-<model_name>/py-model/1 (version directory).
model_dir_path = Path(args.model_dir)/f"py-{args.model_name}/py-model/1"
model_dir_path.mkdir(parents=True, exist_ok=True)

# Step 2: place a copy of model.py inside the version directory.
shutil.copy(SCRIPT_DIR/'model.py', model_dir_path/'model.py')

# Step 3: render the config template and write it one level above the
# version directory.
template = Template(CONFIG_TEMPLATE_PATH.read_text())
substitutions = {
    key: getattr(args, key)
    for key in ("org_name", "model_name", "use_half", "use_int8", "use_auto_device_map")
}
config = template.substitute(**substitutions)
(model_dir_path/'../config.pbtxt').write_text(config)

91
python_backend/model.py Normal file
View file

@ -0,0 +1,91 @@
import json
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
import triton_python_backend_utils as pb_utils
from torch.utils.dlpack import to_dlpack, from_dlpack
import torch
def pb2torch(request, name):
    """Fetch input tensor *name* from a Triton request as a torch tensor.

    The conversion goes through DLPack in both directions.
    """
    return from_dlpack(pb_utils.get_input_tensor_by_name(request, name).to_dlpack())
def torch2pb(name, tensor):
    """Wrap torch *tensor* as a Triton Python-backend tensor called *name*.

    The tensor is handed over through a DLPack capsule.
    """
    capsule = to_dlpack(tensor)
    return pb_utils.Tensor.from_dlpack(name, capsule)
class TritonPythonModel:
    """Triton Python-backend model wrapping a HuggingFace causal language model.

    ``initialize`` loads the model and tokenizer named by the ``org_name`` and
    ``model_name`` config parameters; ``execute`` runs sampling-based
    generation for each incoming request.
    """

    def initialize(self, args):
        """Load the model and tokenizer described by the model config.

        Args:
            args: Mapping supplied by Triton; ``args["model_config"]`` is the
                JSON-encoded model configuration.

        Raises:
            KeyError: If ``model_name``, ``use_half``, ``use_int8`` or
                ``use_auto_device_map`` is missing from the config parameters.
        """
        self.model_config = model_config = json.loads(args["model_config"])
        parameters = model_config["parameters"]

        def get_bool(key):
            # Parameters arrive as strings; "1" and "true" (any case) are truthy.
            return parameters[key]["string_value"].lower() in ["1", "true"]

        org_name = parameters.get("org_name", {"string_value": "Salesforce"})["string_value"]
        model_name = org_name + "/" + parameters["model_name"]["string_value"]

        is_half = get_bool("use_half")
        # int8 will make inference marginally slower, but will allow bigger
        # models to fit in GPU
        int8 = get_bool("use_int8")
        auto_device_map = get_bool("use_auto_device_map")
        print(f"is_half: {is_half}, int8: {int8}, auto_device_map: {auto_device_map}")

        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if is_half else "auto",
            load_in_8bit=int8,
            device_map="auto" if auto_device_map else None,
            low_cpu_mem_usage=True,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        print(f"Model {model_name} Loaded. Footprint: {self.model.get_memory_footprint()}")

        # set max_batch_size
        self.max_batch_size = 0  # model_config["max_batch_size"]

    @staticmethod
    def _optional_scalar(request, name):
        """Return the first value of input *name*, or None if it was not sent."""
        tensor = pb_utils.get_input_tensor_by_name(request, name)
        if tensor is None:
            # The input is declared optional in the model config and the
            # client omitted it; None lets generate() use its own default.
            return None
        return tensor.as_numpy().tolist()[0]

    def execute(self, requests):
        """Generate completions for every request.

        Args:
            requests: Iterable of Triton inference requests. Each must carry
                ``input_ids``, ``input_lengths`` and ``request_output_len``;
                ``runtime_top_k``, ``runtime_top_p`` and ``temperature`` may
                be omitted.

        Returns:
            A list with one ``InferenceResponse`` per request, each holding
            ``output_ids`` and ``sequence_length`` tensors.
        """
        # TODO: don't just loop over requests. batch them up
        responses = []
        eos_token_id = self.model.config.eos_token_id

        for request in requests:
            input_ids_torch = pb2torch(request, "input_ids")
            input_lengths_torch = pb2torch(request, "input_lengths")
            request_output_len_torch = pb2torch(request, "request_output_len")

            # Attention mask: only needed when the rows have different lengths.
            attention_mask = None
            if input_lengths_torch.min() != input_lengths_torch.max():
                attention_mask = torch.zeros(input_ids_torch.shape, dtype=torch.long)
                for row, length in enumerate(input_lengths_torch):
                    attention_mask[row, :length] = 1

            # Output length: take the first entry and hand generate() a plain
            # Python int rather than a 0-d tensor.
            max_new_tokens = int(request_output_len_torch[0][0])

            # Sampling parameters are optional inputs and may be absent.
            top_k = self._optional_scalar(request, "runtime_top_k")
            top_p = self._optional_scalar(request, "runtime_top_p")
            temperature = self._optional_scalar(request, "temperature")

            # TODO: client doesn't send "n" yet. instead it duplicates the request n times
            n_samples = 1

            # Generate
            output_ids = self.model.generate(
                input_ids=input_ids_torch,
                attention_mask=attention_mask,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                top_k=top_k,
                top_p=top_p,
                num_return_sequences=n_samples,
                temperature=temperature,
            )

            # client wants batch x beam_width x seq_len and we don't support beam_width yet
            output_ids = output_ids.unsqueeze(1)

            # create output tensors
            out_tensor_pb = torch2pb("output_ids", output_ids)

            # sequence_length: per (batch, beam) row, the number of tokens that
            # differ from the EOS id, stored as a CPU int32 tensor.
            sequence_length = (
                (output_ids != eos_token_id)
                .sum(dim=-1)
                .to(device="cpu", dtype=torch.int32)
            )
            sequence_length_pb = torch2pb("sequence_length", sequence_length)

            # create response
            responses.append(
                pb_utils.InferenceResponse([out_tensor_pb, sequence_length_pb])
            )

        return responses