Add Python backend support

- Modify Dockerfile to include bitsandbytes, transformers, and the latest version of PyTorch
- Minor modifications in utils/codegen.py so that the same client works with both FT and the Python backend (see the request sketch below)
- Minor modifications in launch.sh (models no longer need to be named by GPU count)
- Add an installation script for adding a new Python model (with a super simple config_template)
- Modify setup.sh so that it works with both FT and Python backend models

Signed-off-by: Parth Thakkar <thakkarparth007@gmail.com>
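
Since the proxy now dispatches on the model name carried in the request body, the same client can target either backend by changing a single field. Below is a minimal sketch of the two request shapes, using only keys that `CodeGenProxy.generate` reads in the diff further down; the `py-codegen-350M` name is a hypothetical example of the `py-` naming convention, not a name taken from this commit.

    # Hypothetical request payloads passed to CodeGenProxy.generate(data).
    # Only the "model" field differs between the two backends.
    ft_request = {
        "model": "fastertransformer",  # routed to the FasterTransformer backend
        "prompt": "def fib(n):",
        "n": 1,
        "max_tokens": 32,
    }

    py_request = {
        "model": "py-codegen-350M",    # "py-" prefix selects int32 inputs (hypothetical name)
        "prompt": "def fib(n):",
        "n": 1,
        "max_tokens": 32,
    }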
Author: Parth Thakkar, 2022-10-16 22:05:00 -05:00
commit 01f1cbb629 (parent 9b2bc84670)
9 changed files with 487 additions and 73 deletions

utils/codegen.py

@@ -70,17 +70,21 @@ class CodeGenProxy:
         return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2))
 
     def generate(self, data):
-        model_name = "fastertransformer"
         prompt = data['prompt']
         n = data.get('n', 1)
+        model_name = data["model"]
+        # ugly hack to set the data type correctly. Huggingface models want int32, but fastertransformer needs uint32
+        # i could've done the conversion from uint32 to int32 in the model but that'd be inefficient.
+        np_type = np.int32 if model_name.startswith("py-") else np.uint32
         input_start_ids = np.expand_dims(self.tokenizer.encode(prompt).ids, 0)
-        input_start_ids = np.repeat(input_start_ids, n, axis=0).astype(np.uint32)
+        input_start_ids = np.repeat(input_start_ids, n, axis=0).astype(np_type)
         prompt_len = input_start_ids.shape[1]
-        input_len = prompt_len * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
+        input_len = prompt_len * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
         max_tokens = data.get('max_tokens', 16)
         if max_tokens + input_len[0][0] > self.MAX_MODEL_LEN:
             raise ValueError("Max tokens + prompt length exceeds maximum model length")
-        output_len = np.ones_like(input_len).astype(np.uint32) * max_tokens
+        output_len = np.ones_like(input_len).astype(np_type) * max_tokens
         num_logprobs = data.get('logprobs', -1)
         if num_logprobs is None:
             num_logprobs = 1
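
The "ugly hack" comment above is the crux of this hunk: at the time of this commit, PyTorch had no unsigned 32-bit tensor dtype, so token arrays bound for a Huggingface/PyTorch model must be int32, while FasterTransformer expects uint32. A small illustration of why the cast has to happen on the client side (a sketch, assuming a torch build without uint32 support, as was the case in 2022):

    import numpy as np
    import torch

    ids_i32 = np.array([[1, 2, 3]], dtype=np.int32)
    ids_u32 = ids_i32.astype(np.uint32)

    torch.from_numpy(ids_i32)    # fine: torch has a matching int32 dtype
    # torch.from_numpy(ids_u32)  # raises TypeError: numpy.uint32 has no torch equivalent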
@@ -95,7 +99,7 @@ class CodeGenProxy:
         top_p = data.get('top_p', 1.0)
         frequency_penalty = data.get('frequency_penalty', 1.0)
-        runtime_top_k = top_k * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
+        runtime_top_k = top_k * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
         runtime_top_p = top_p * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
         beam_search_diversity_rate = 0.0 * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
         random_seed = np.random.randint(0, 2 ** 31 - 1, (input_start_ids.shape[0], 1), dtype=np.int32)
@@ -103,9 +107,9 @@ class CodeGenProxy:
         len_penalty = 1.0 * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
         repetition_penalty = frequency_penalty * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
         is_return_log_probs = want_logprobs * np.ones([input_start_ids.shape[0], 1]).astype(np.bool_)
-        beam_width = (1 * np.ones([input_start_ids.shape[0], 1])).astype(np.uint32)
-        start_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
-        end_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
+        beam_width = (1 * np.ones([input_start_ids.shape[0], 1])).astype(np_type)
+        start_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
+        end_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
         stop_words = data.get('stop', [])
         if stop_words is None:
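
All of the `[batch_size, 1]` parameter arrays built above are eventually shipped to Triton as named input tensors, which is why their dtype (int32 vs uint32 via `np_type`) must match what each backend declares. A sketch of the usual wrapping step with the Triton client library (assuming `tritonclient.grpc`; the input name shown follows the FasterTransformer backend's convention and must in practice match the model's config.pbtxt):

    import numpy as np
    import tritonclient.grpc as client_util
    from tritonclient.utils import np_to_triton_dtype

    def prepare_tensor(name: str, array: np.ndarray):
        # Wrap a numpy array as a named Triton input; the dtype chosen via
        # np_type is carried through np_to_triton_dtype unchanged.
        tensor = client_util.InferInput(name, array.shape, np_to_triton_dtype(array.dtype))
        tensor.set_data_from_numpy(array)
        return tensor

    # e.g. a single-prompt batch with the FasterTransformer dtype:
    np_type = np.uint32
    output_len = 32 * np.ones([1, 1]).astype(np_type)
    inputs = [prepare_tensor("request_output_len", output_len)]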