Merge branch 'main' into main

commit 7ebb56c551
Fred de Gier, 2023-02-13 16:07:45 +01:00, committed by GitHub
37 changed files with 1156 additions and 121 deletions

File: Dockerfile

@@ -10,4 +10,4 @@ COPY . .
EXPOSE 5000
CMD [ "uvicorn", "--host", "0.0.0.0", "--port", "5000", "app:app"]
CMD ["uvicorn", "--host", "0.0.0.0", "--port", "5000", "app:app"]

File: app.py

@@ -1,11 +1,17 @@
import logging
import os
import uvicorn
from fastapi import FastAPI, Response
from fastapi import FastAPI, Request, Response
from fastapi.responses import JSONResponse
from sse_starlette.sse import EventSourceResponse
from config.log_config import uvicorn_logger
from models import OpenAIinput
from utils.codegen import CodeGenProxy
from utils.errors import FauxPilotException
logging.config.dictConfig(uvicorn_logger)
codegen = CodeGenProxy(
host=os.environ.get("TRITON_HOST", "triton"),
@@ -21,24 +27,50 @@ app = FastAPI(
swagger_ui_parameters={"defaultModelsExpandDepth": -1}
)
@app.exception_handler(FauxPilotException)
async def fauxpilot_handler(request: Request, exc: FauxPilotException):
return JSONResponse(
status_code=400,
content=exc.json()
)
@app.post("/v1/engines/codegen/completions", status_code=200)
@app.post("/v1/completions", status_code=200)
# Used to support copilot.vim
@app.get("/copilot_internal/v2/token")
def get_copilot_token():
content = {'token': '1', 'expires_at': 2600000000, 'refresh_in': 900}
return JSONResponse(
status_code=200,
content=content
)
@app.post("/v1/engines/codegen/completions")
# Used to support copilot.vim
@app.post("/v1/engines/copilot-codex/completions")
@app.post("/v1/completions")
async def completions(data: OpenAIinput):
data = data.dict()
print(data)
try:
content = codegen(data=data)
except codegen.TokensExceedsMaximum as E:
raise FauxPilotException(
message=str(E),
type="invalid_request_error",
param=None,
code=None,
)
if data.get("stream") is not None:
return EventSourceResponse(
content=codegen(data=data),
content=content,
status_code=200,
media_type="text/event-stream"
)
else:
return Response(
status_code=200,
content=codegen(data=data),
content=content,
media_type="application/json"
)
if __name__ == "__main__":
uvicorn.run("app:app", host=os.environ.get("API_HOST", "0.0.0.0"), port=os.environ.get("API_PORT", 5000))
uvicorn.run("app:app", host="0.0.0.0", port=5000)

File: config/log_config.py

@@ -0,0 +1,27 @@
# The uvicorn_logger is used to add timestamps
uvicorn_logger = {
"version": 1,
"disable_existing_loggers": False,
"formatters": {
"access": {
"()": "uvicorn.logging.AccessFormatter",
"fmt": '%(levelprefix)s %(asctime)s :: %(client_addr)s - "%(request_line)s" %(status_code)s',
"use_colors": True
},
},
"handlers": {
"access": {
"formatter": "access",
"class": "logging.StreamHandler",
"stream": "ext://sys.stdout",
},
},
"loggers": {
"uvicorn.access": {
"handlers": ["access"],
# "level": "INFO",
"propagate": False
},
},
}
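With this formatter applied (app.py loads it via logging.config.dictConfig), an access-log line looks roughly like the following; the timestamp and client address are placeholder values:

INFO:     2023-02-13 16:07:45,123 :: 127.0.0.1:51234 - "POST /v1/completions HTTP/1.1" 200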

File: models.py

@@ -1,10 +1,10 @@
from typing import Optional, Union
from pydantic import BaseModel
from pydantic import BaseModel, constr
class OpenAIinput(BaseModel):
model: str
model: constr(regex="^(fastertransformer|py-model)$") = "fastertransformer"
prompt: Optional[str]
suffix: Optional[str]
max_tokens: Optional[int] = 16
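A minimal sketch of what the constr regex buys, using pydantic v1 semantics (constr(regex=...) is the v1 keyword) and assuming the remaining OpenAIinput fields are optional, as the Optional[...] annotations suggest; the prompt strings are arbitrary examples:

from pydantic import ValidationError
from models import OpenAIinput

OpenAIinput(prompt="def add(a, b):")           # ok: model defaults to "fastertransformer"
OpenAIinput(model="py-model", prompt="x = 1")  # ok: matches the regex

try:
    OpenAIinput(model="gpt-4", prompt="x = 1")
except ValidationError as err:
    print(err)  # rejected at the API boundary, before the request reaches Triton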

File: utils/codegen.py

@@ -6,14 +6,14 @@ import time
import numpy as np
import tritonclient.grpc as client_util
from tokenizers import Tokenizer
from tritonclient.utils import np_to_triton_dtype
from tritonclient.utils import np_to_triton_dtype, InferenceServerException
np.finfo(np.dtype("float32"))
np.finfo(np.dtype("float64"))
class CodeGenProxy:
def __init__(self, host: str = 'localhost', port: int = 8001, verbose: bool = False):
def __init__(self, host: str = 'triton', port: int = 8001, verbose: bool = False):
self.tokenizer = Tokenizer.from_file('/python-docker/cgtok/tokenizer.json')
self.client = client_util.InferenceServerClient(url=f'{host}:{port}', verbose=verbose)
self.PAD_CHAR = 50256
@@ -21,6 +21,9 @@ class CodeGenProxy:
# Max number of tokens the model can handle
self.MAX_MODEL_LEN = 2048
class TokensExceedsMaximum(Exception):
pass
@staticmethod
def prepare_tensor(name: str, tensor_input):
t = client_util.InferInput(
@@ -70,20 +73,31 @@ class CodeGenProxy:
return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2))
def generate(self, data):
model_name = "fastertransformer"
prompt = data['prompt']
n = data.get('n', 1)
model_name = data["model"]
# Ugly hack to set the data type correctly: Hugging Face models want int32, but FasterTransformer needs uint32.
# I could've done the conversion from uint32 to int32 in the model, but that'd be inefficient.
np_type = np.int32 if model_name.startswith("py-") else np.uint32
input_start_ids = np.expand_dims(self.tokenizer.encode(prompt).ids, 0)
input_start_ids = np.repeat(input_start_ids, n, axis=0).astype(np.uint32)
input_start_ids = np.repeat(input_start_ids, n, axis=0).astype(np_type)
prompt_len = input_start_ids.shape[1]
input_len = prompt_len * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
input_len = prompt_len * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
max_tokens = data.get('max_tokens', 16)
if max_tokens + input_len[0][0] > self.MAX_MODEL_LEN:
raise ValueError("Max tokens + prompt length exceeds maximum model length")
output_len = np.ones_like(input_len).astype(np.uint32) * max_tokens
prompt_tokens: int = input_len[0][0]
requested_tokens = max_tokens + prompt_tokens
if requested_tokens > self.MAX_MODEL_LEN:
raise self.TokensExceedsMaximum(
f"This model's maximum context length is {self.MAX_MODEL_LEN}, however you requested "
f"{requested_tokens} tokens ({prompt_tokens} in your prompt; {max_tokens} for the completion). "
f"Please reduce your prompt; or completion length."
)
output_len = np.ones_like(input_len).astype(np_type) * max_tokens
num_logprobs = data.get('logprobs', -1)
if num_logprobs is None:
num_logprobs = 1
num_logprobs = -1
want_logprobs = num_logprobs > 0
temperature = data.get('temperature', 0.2)
@@ -95,7 +109,7 @@
top_p = data.get('top_p', 1.0)
frequency_penalty = data.get('frequency_penalty', 1.0)
runtime_top_k = top_k * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
runtime_top_k = top_k * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
runtime_top_p = top_p * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
beam_search_diversity_rate = 0.0 * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
random_seed = np.random.randint(0, 2 ** 31 - 1, (input_start_ids.shape[0], 1), dtype=np.int32)
@@ -103,9 +117,9 @@
len_penalty = 1.0 * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
repetition_penalty = frequency_penalty * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
is_return_log_probs = want_logprobs * np.ones([input_start_ids.shape[0], 1]).astype(np.bool_)
beam_width = (1 * np.ones([input_start_ids.shape[0], 1])).astype(np.uint32)
start_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
end_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
beam_width = (1 * np.ones([input_start_ids.shape[0], 1])).astype(np_type)
start_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
end_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
stop_words = data.get('stop', [])
if stop_words is None:
@@ -220,8 +234,8 @@ class CodeGenProxy:
for c in choices:
completion['id'] = self.random_completion_id()
completion['choices'] = [c]
yield f'data: {json.dumps(completion)}\n\n'
yield 'data: [DONE]\n\n'
yield f'{json.dumps(completion)}'
yield '[DONE]'
def non_streamed_response(self, completion, choices) -> str:
completion['id'] = self.random_completion_id()
@@ -230,7 +244,19 @@
def __call__(self, data: dict):
st = time.time()
completion, choices = self.generate(data)
try:
completion, choices = self.generate(data)
except InferenceServerException as exc:
# status: unavailable -- this happens if the `model` string is invalid
print(exc)
if exc.status() == 'StatusCode.UNAVAILABLE':
print(
f"WARNING: Model '{data['model']}' is not available. Please ensure that "
"`model` is set to either 'fastertransformer' or 'py-model' depending on "
"your installation"
)
completion = {}
choices = []
ed = time.time()
print(f"Returned completion in {(ed - st) * 1000} ms")
if data.get('stream', False):
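As a concrete illustration of the new limit check in generate(), with made-up token counts:

# Hypothetical request against MAX_MODEL_LEN = 2048:
prompt_tokens = 2000                           # len(tokenizer.encode(prompt).ids)
max_tokens = 100                               # from the request body
requested_tokens = prompt_tokens + max_tokens  # 2100 > 2048
# -> TokensExceedsMaximum is raised, which app.py turns into a 400 FauxPilotException
#    instead of the previously unhandled ValueError.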

File: utils/errors.py

@@ -0,0 +1,19 @@
from typing import Optional
class FauxPilotException(Exception):
def __init__(self, message: str, type: Optional[str] = None, param: Optional[str] = None, code: Optional[int] = None):
super().__init__(message)
self.message = message
self.type = type
self.param = param
self.code = code
def json(self):
return {
'error': {
'message': self.message,
'type': self.type,
'param': self.param,
'code': self.code
}
}
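For reference, this is roughly what json() returns for the token-limit case above; the message text is abridged and the type/param/code values mirror how app.py raises FauxPilotException:

{
    'error': {
        'message': "This model's maximum context length is 2048, however you requested ...",
        'type': 'invalid_request_error',
        'param': None,
        'code': None
    }
}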