Merge branch 'main' into main

commit 7ebb56c551
Fred de Gier, 2023-02-13 16:07:45 +01:00, committed by GitHub
37 changed files with 1156 additions and 121 deletions

File: Dockerfile

@@ -10,4 +10,4 @@ COPY . .
EXPOSE 5000
CMD [ "uvicorn", "--host", "0.0.0.0", "--port", "5000", "app:app"]
CMD ["uvicorn", "--host", "0.0.0.0", "--port", "5000", "app:app"]

File: app.py

@@ -1,11 +1,17 @@
import logging
import os
import uvicorn
from fastapi import FastAPI, Response
from fastapi import FastAPI, Request, Response
from fastapi.responses import JSONResponse
from sse_starlette.sse import EventSourceResponse
from config.log_config import uvicorn_logger
from models import OpenAIinput
from utils.codegen import CodeGenProxy
from utils.errors import FauxPilotException
logging.config.dictConfig(uvicorn_logger)
codegen = CodeGenProxy(
host=os.environ.get("TRITON_HOST", "triton"),
@@ -21,24 +27,50 @@ app = FastAPI(
swagger_ui_parameters={"defaultModelsExpandDepth": -1}
)
@app.exception_handler(FauxPilotException)
async def fauxpilot_handler(request: Request, exc: FauxPilotException):
return JSONResponse(
status_code=400,
content=exc.json()
)
@app.post("/v1/engines/codegen/completions", status_code=200)
@app.post("/v1/completions", status_code=200)
# Used to support copilot.vim
@app.get("/copilot_internal/v2/token")
def get_copilot_token():
content = {'token': '1', 'expires_at': 2600000000, 'refresh_in': 900}
return JSONResponse(
status_code=200,
content=content
)
@app.post("/v1/engines/codegen/completions")
# Used to support copilot.vim
@app.post("/v1/engines/copilot-codex/completions")
@app.post("/v1/completions")
async def completions(data: OpenAIinput):
data = data.dict()
print(data)
try:
content = codegen(data=data)
except codegen.TokensExceedsMaximum as E:
raise FauxPilotException(
message=str(E),
type="invalid_request_error",
param=None,
code=None,
)
if data.get("stream") is not None:
return EventSourceResponse(
content=codegen(data=data),
content=content,
status_code=200,
media_type="text/event-stream"
)
else:
return Response(
status_code=200,
content=codegen(data=data),
content=content,
media_type="application/json"
)
if __name__ == "__main__":
uvicorn.run("app:app", host=os.environ.get("API_HOST", "0.0.0.0"), port=os.environ.get("API_PORT", 5000))
uvicorn.run("app:app", host="0.0.0.0", port=5000)

File: config/log_config.py

@@ -0,0 +1,27 @@
# The uvicorn_logger is used to add timestamps
uvicorn_logger = {
"version": 1,
"disable_existing_loggers": False,
"formatters": {
"access": {
"()": "uvicorn.logging.AccessFormatter",
"fmt": '%(levelprefix)s %(asctime)s :: %(client_addr)s - "%(request_line)s" %(status_code)s',
"use_colors": True
},
},
"handlers": {
"access": {
"formatter": "access",
"class": "logging.StreamHandler",
"stream": "ext://sys.stdout",
},
},
"loggers": {
"uvicorn.access": {
"handlers": ["access"],
# "level": "INFO",
"propagate": False
},
},
}
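With this formatter applied (app.py loads it via logging.config.dictConfig), an access-log line looks roughly like the following; the timestamp and client address are placeholder values:

INFO:     2023-02-13 16:07:45,123 :: 127.0.0.1:51234 - "POST /v1/completions HTTP/1.1" 200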

File: models.py

@@ -1,10 +1,10 @@
from typing import Optional, Union
from pydantic import BaseModel
from pydantic import BaseModel, constr
class OpenAIinput(BaseModel):
model: str
model: constr(regex="^(fastertransformer|py-model)$") = "fastertransformer"
prompt: Optional[str]
suffix: Optional[str]
max_tokens: Optional[int] = 16
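A minimal sketch of what the constr regex buys, using pydantic v1 semantics (constr(regex=...) is the v1 keyword) and assuming the remaining OpenAIinput fields are optional, as the Optional[...] annotations suggest; the prompt strings are arbitrary examples:

from pydantic import ValidationError
from models import OpenAIinput

OpenAIinput(prompt="def add(a, b):")           # ok: model defaults to "fastertransformer"
OpenAIinput(model="py-model", prompt="x = 1")  # ok: matches the regex

try:
    OpenAIinput(model="gpt-4", prompt="x = 1")
except ValidationError as err:
    print(err)  # rejected at the API boundary, before the request reaches Triton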

File: utils/codegen.py

@@ -6,14 +6,14 @@ import time
import numpy as np
import tritonclient.grpc as client_util
from tokenizers import Tokenizer
from tritonclient.utils import np_to_triton_dtype
from tritonclient.utils import np_to_triton_dtype, InferenceServerException
np.finfo(np.dtype("float32"))
np.finfo(np.dtype("float64"))
class CodeGenProxy:
def __init__(self, host: str = 'localhost', port: int = 8001, verbose: bool = False):
def __init__(self, host: str = 'triton', port: int = 8001, verbose: bool = False):
self.tokenizer = Tokenizer.from_file('/python-docker/cgtok/tokenizer.json')
self.client = client_util.InferenceServerClient(url=f'{host}:{port}', verbose=verbose)
self.PAD_CHAR = 50256
@@ -21,6 +21,9 @@ class CodeGenProxy:
# Max number of tokens the model can handle
self.MAX_MODEL_LEN = 2048
class TokensExceedsMaximum(Exception):
pass
@staticmethod
def prepare_tensor(name: str, tensor_input):
t = client_util.InferInput(
@@ -70,20 +73,31 @@ class CodeGenProxy:
return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2))
def generate(self, data):
model_name = "fastertransformer"
prompt = data['prompt']
n = data.get('n', 1)
model_name = data["model"]
# Ugly hack to set the data type correctly: Hugging Face models want int32, but FasterTransformer needs uint32.
# I could've done the conversion from uint32 to int32 in the model, but that'd be inefficient.
np_type = np.int32 if model_name.startswith("py-") else np.uint32
input_start_ids = np.expand_dims(self.tokenizer.encode(prompt).ids, 0)
input_start_ids = np.repeat(input_start_ids, n, axis=0).astype(np.uint32)
input_start_ids = np.repeat(input_start_ids, n, axis=0).astype(np_type)
prompt_len = input_start_ids.shape[1]
input_len = prompt_len * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
input_len = prompt_len * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
max_tokens = data.get('max_tokens', 16)
if max_tokens + input_len[0][0] > self.MAX_MODEL_LEN:
raise ValueError("Max tokens + prompt length exceeds maximum model length")
output_len = np.ones_like(input_len).astype(np.uint32) * max_tokens
prompt_tokens: int = input_len[0][0]
requested_tokens = max_tokens + prompt_tokens
if requested_tokens > self.MAX_MODEL_LEN:
raise self.TokensExceedsMaximum(
f"This model's maximum context length is {self.MAX_MODEL_LEN}, however you requested "
f"{requested_tokens} tokens ({prompt_tokens} in your prompt; {max_tokens} for the completion). "
f"Please reduce your prompt; or completion length."
)
output_len = np.ones_like(input_len).astype(np_type) * max_tokens
num_logprobs = data.get('logprobs', -1)
if num_logprobs is None:
num_logprobs = 1
num_logprobs = -1
want_logprobs = num_logprobs > 0
temperature = data.get('temperature', 0.2)
@@ -95,7 +109,7 @@
top_p = data.get('top_p', 1.0)
frequency_penalty = data.get('frequency_penalty', 1.0)
runtime_top_k = top_k * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
runtime_top_k = top_k * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
runtime_top_p = top_p * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
beam_search_diversity_rate = 0.0 * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
random_seed = np.random.randint(0, 2 ** 31 - 1, (input_start_ids.shape[0], 1), dtype=np.int32)
@@ -103,9 +117,9 @@
len_penalty = 1.0 * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
repetition_penalty = frequency_penalty * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
is_return_log_probs = want_logprobs * np.ones([input_start_ids.shape[0], 1]).astype(np.bool_)
beam_width = (1 * np.ones([input_start_ids.shape[0], 1])).astype(np.uint32)
start_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
end_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
beam_width = (1 * np.ones([input_start_ids.shape[0], 1])).astype(np_type)
start_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
end_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
stop_words = data.get('stop', [])
if stop_words is None:
@@ -220,8 +234,8 @@ class CodeGenProxy:
for c in choices:
completion['id'] = self.random_completion_id()
completion['choices'] = [c]
yield f'data: {json.dumps(completion)}\n\n'
yield 'data: [DONE]\n\n'
yield f'{json.dumps(completion)}'
yield '[DONE]'
def non_streamed_response(self, completion, choices) -> str:
completion['id'] = self.random_completion_id()
@@ -230,7 +244,19 @@
def __call__(self, data: dict):
st = time.time()
completion, choices = self.generate(data)
try:
completion, choices = self.generate(data)
except InferenceServerException as exc:
# status: unavailable -- this happens if the `model` string is invalid
print(exc)
if exc.status() == 'StatusCode.UNAVAILABLE':
print(
f"WARNING: Model '{data['model']}' is not available. Please ensure that "
"`model` is set to either 'fastertransformer' or 'py-model' depending on "
"your installation"
)
completion = {}
choices = []
ed = time.time()
print(f"Returned completion in {(ed - st) * 1000} ms")
if data.get('stream', False):
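As a concrete illustration of the new limit check in generate(), with made-up token counts:

# Hypothetical request against MAX_MODEL_LEN = 2048:
prompt_tokens = 2000                           # len(tokenizer.encode(prompt).ids)
max_tokens = 100                               # from the request body
requested_tokens = prompt_tokens + max_tokens  # 2100 > 2048
# -> TokensExceedsMaximum is raised, which app.py turns into a 400 FauxPilotException
#    instead of the previously unhandled ValueError.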

File: utils/errors.py

@@ -0,0 +1,19 @@
from typing import Optional
class FauxPilotException(Exception):
def __init__(self, message: str, type: Optional[str] = None, param: Optional[str] = None, code: Optional[int] = None):
super().__init__(message)
self.message = message
self.type = type
self.param = param
self.code = code
def json(self):
return {
'error': {
'message': self.message,
'type': self.type,
'param': self.param,
'code': self.code
}
}
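For reference, this is roughly what json() returns for the token-limit case above; the message text is abridged and the type/param/code values mirror how app.py raises FauxPilotException:

{
    'error': {
        'message': "This model's maximum context length is 2048, however you requested ...",
        'type': 'invalid_request_error',
        'param': None,
        'code': None
    }
}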