Merge branch 'main' into python_backend

Signed-off-by: Parth Thakkar <thakkarparth007@gmail.com>

Commit f0a12b5e8e: 10 changed files with 145 additions and 13 deletions
.editorconfig (new file, 49 lines)
@@ -0,0 +1,49 @@
+# EditorConfig is awesome: https://EditorConfig.org
+
+# top-most EditorConfig file
+root = true
+
+[*]
+indent_style = space
+indent_size = 4
+tab_width = 4
+# end_of_line = crlf
+charset = utf-8
+trim_trailing_whitespace = true
+insert_final_newline = false
+
+# Markdown
+[*.{md}]
+indent_style = space
+indent_size = 2
+insert_final_newline = true
+
+# Serialized data
+[*.{yml,yaml,json,pbtxt}]
+indent_style = space
+indent_size = 2
+insert_final_newline = true
+
+# Shell script
+[*.{sh,bash,bashrc,zsh,fish,ksh,csh}]
+indent_style = space
+indent_size = 4
+insert_final_newline = true
+
+# Python
+[*.py]
+indent_style = space
+indent_size = 4
+insert_final_newline = true
+
+# Environment
+[*.env]
+insert_final_newline = false
+
+# Python requirements
+[requirements.txt]
+insert_final_newline = true
+
+# Dockerfile
+[Dockerfile]
+insert_final_newline = true
@@ -1,7 +1,11 @@
 # FauxPilot
 
 This is an attempt to build a locally hosted version of [GitHub Copilot](https://copilot.github.com/). It uses the [SalesForce CodeGen](https://github.com/salesforce/CodeGen) models inside of NVIDIA's [Triton Inference Server](https://developer.nvidia.com/nvidia-triton-inference-server) with the [FasterTransformer backend](https://github.com/triton-inference-server/fastertransformer_backend/).
 
+<p align="right">
+ <img width="50%" align="right" src="https://user-images.githubusercontent.com/34380/199604776-e08ce261-48d1-4d09-9163-f5ab2f904e55.png">
+</p>
+
 ## Prerequisites
 
@@ -1,11 +1,17 @@
+import logging
 import os
 
 import uvicorn
-from fastapi import FastAPI, Response
+from fastapi import FastAPI, Request, Response
+from fastapi.responses import JSONResponse
 from sse_starlette.sse import EventSourceResponse
 
+from config.log_config import uvicorn_logger
 from models import OpenAIinput
 from utils.codegen import CodeGenProxy
+from utils.errors import FauxPilotException
+
+logging.config.dictConfig(uvicorn_logger)
 
 codegen = CodeGenProxy(
     host=os.environ.get("TRITON_HOST", "triton"),
@@ -21,22 +27,37 @@ app = FastAPI(
     swagger_ui_parameters={"defaultModelsExpandDepth": -1}
 )
 
 
+@app.exception_handler(FauxPilotException)
+async def fauxpilot_handler(request: Request, exc: FauxPilotException):
+    return JSONResponse(
+        status_code=400,
+        content=exc.json()
+    )
+
+
-@app.post("/v1/engines/codegen/completions", status_code=200)
-@app.post("/v1/completions", status_code=200)
+@app.post("/v1/engines/codegen/completions")
+@app.post("/v1/completions")
 async def completions(data: OpenAIinput):
     data = data.dict()
-    print(data)
+    try:
+        content = codegen(data=data)
+    except codegen.TokensExceedsMaximum as E:
+        raise FauxPilotException(
+            message=str(E),
+            type="invalid_request_error",
+            param=None,
+            code=None,
+        )
+
     if data.get("stream") is not None:
         return EventSourceResponse(
-            content=codegen(data=data),
+            content=content,
             status_code=200,
             media_type="text/event-stream"
         )
     else:
         return Response(
             status_code=200,
-            content=codegen(data=data),
+            content=content,
             media_type="application/json"
         )
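The net effect of this hunk is that an oversized request now comes back as a structured HTTP 400 instead of an unhandled error. A minimal client-side sketch of what that looks like, assuming the proxy listens on localhost:5000 (the API_PORT from the env file removed later in this diff) and that the requests package is available:

import requests  # assumed available; any HTTP client would do

# Deliberately ask for more completion tokens than the 2048-token budget allows,
# so the new FauxPilotException handler fires and returns a 400 with exc.json().
resp = requests.post(
    "http://localhost:5000/v1/completions",
    json={"prompt": "def fib(n):", "max_tokens": 4096},
)
if resp.status_code == 400:
    err = resp.json()["error"]
    print(err["type"], err["message"])  # invalid_request_error plus the explanation string
else:
    print(resp.text)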
copilot_proxy/config/__init__.py (new empty file)

copilot_proxy/config/log_config.py (new file, 27 lines)
@@ -0,0 +1,27 @@
+# The uvicorn_logger is used to add timestamps
+
+uvicorn_logger = {
+    "version": 1,
+    "disable_existing_loggers": False,
+    "formatters": {
+        "access": {
+            "()": "uvicorn.logging.AccessFormatter",
+            "fmt": '%(levelprefix)s %(asctime)s :: %(client_addr)s - "%(request_line)s" %(status_code)s',
+            "use_colors": True
+        },
+    },
+    "handlers": {
+        "access": {
+            "formatter": "access",
+            "class": "logging.StreamHandler",
+            "stream": "ext://sys.stdout",
+        },
+    },
+    "loggers": {
+        "uvicorn.access": {
+            "handlers": ["access"],
+            # "level": "INFO",
+            "propagate": False
+        },
+    },
+}
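This dict is a standard logging dictConfig schema; the FastAPI app in this commit applies it with logging.config.dictConfig(uvicorn_logger). A minimal sketch of doing the same when launching uvicorn by hand, assuming it is run from the copilot_proxy directory so config.log_config and app:app resolve:

import logging.config

import uvicorn

from config.log_config import uvicorn_logger

# Install the timestamped access-log format, then start the proxy app.
logging.config.dictConfig(uvicorn_logger)
uvicorn.run("app:app", host="0.0.0.0", port=5000)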
@@ -21,6 +21,9 @@ class CodeGenProxy:
         # Max number of tokens the model can handle
         self.MAX_MODEL_LEN = 2048
 
+    class TokensExceedsMaximum(Exception):
+        pass
+
     @staticmethod
     def prepare_tensor(name: str, tensor_input):
         t = client_util.InferInput(
@@ -82,8 +85,15 @@ class CodeGenProxy:
         prompt_len = input_start_ids.shape[1]
         input_len = prompt_len * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
         max_tokens = data.get('max_tokens', 16)
-        if max_tokens + input_len[0][0] > self.MAX_MODEL_LEN:
-            raise ValueError("Max tokens + prompt length exceeds maximum model length")
+        prompt_tokens: int = input_len[0][0]
+        requested_tokens = max_tokens + prompt_tokens
+        if requested_tokens > self.MAX_MODEL_LEN:
+            print(1)
+            raise self.TokensExceedsMaximum(
+                f"This model's maximum context length is {self.MAX_MODEL_LEN}, however you requested "
+                f"{requested_tokens} tokens ({prompt_tokens} in your prompt; {max_tokens} for the completion). "
+                f"Please reduce your prompt; or completion length."
+            )
         output_len = np.ones_like(input_len).astype(np_type) * max_tokens
         num_logprobs = data.get('logprobs', -1)
         if num_logprobs is None:
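As a worked example of the new check: with MAX_MODEL_LEN = 2048, a 2000-token prompt plus max_tokens = 100 asks for 2100 tokens in total, so TokensExceedsMaximum is raised, while 1900 + 100 still fits. A self-contained sketch of the same arithmetic (names mirror the diff, but this stub does not depend on Triton or the real CodeGenProxy):

MAX_MODEL_LEN = 2048  # same budget as CodeGenProxy in this commit

def check_budget(prompt_tokens: int, max_tokens: int) -> int:
    # Mirrors the added check: prompt tokens plus requested completion tokens
    # must not exceed the model's context length.
    requested_tokens = prompt_tokens + max_tokens
    if requested_tokens > MAX_MODEL_LEN:
        raise ValueError(
            f"This model's maximum context length is {MAX_MODEL_LEN}, however you requested "
            f"{requested_tokens} tokens ({prompt_tokens} in your prompt; {max_tokens} for the completion)."
        )
    return requested_tokens

print(check_budget(1900, 100))   # 2000, within budget
# check_budget(2000, 100)        # 2100 > 2048, raises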
copilot_proxy/utils/errors.py (new file, 19 lines)
@@ -0,0 +1,19 @@
+from typing import *
+
+class FauxPilotException(Exception):
+    def __init__(self, message: str, type: Optional[str] = None, param: Optional[str] = None, code: Optional[int] = None):
+        super().__init__(message)
+        self.message = message
+        self.type = type
+        self.param = param
+        self.code = code
+
+    def json(self):
+        return {
+            'error': {
+                'message': self.message,
+                'type': self.type,
+                'param': self.param,
+                'code': self.code
+            }
+        }
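For reference, a quick sketch of the payload json() produces, which is exactly what the exception handler in the proxy app sends back as the 400 body (the message text here is illustrative):

exc = FauxPilotException(
    "requested 2100 tokens but the maximum context length is 2048",
    type="invalid_request_error",
)
print(exc.json())
# {'error': {'message': 'requested 2100 tokens but the maximum context length is 2048',
#            'type': 'invalid_request_error', 'param': None, 'code': None}}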
@@ -64,3 +64,9 @@ And you should be able to use Copilot with your own locally hosted suggestions!
 Another issue with using the Copilot plugin is that its tokenizer (the component that turns text into a sequence of integers for the model) is slightly different from the one used by CodeGen, so the plugin will sometimes send a request that is longer than CodeGen can handle. You can work around this by replacing the `vocab.bpe` and `tokenizer.json` found in the Copilot extension (something like `.vscode/extensions/github.copilot-[version]/dist/`) with the ones found [here](https://github.com/moyix/fauxpilot/tree/main/copilot_proxy/cgtok/openai_format).
 
 Have fun!
+
+## GitLab - VS Code extension
+
+Another option is to use the [GitLab VS Code extension](https://marketplace.visualstudio.com/items?itemName=GitLab.gitlab-workflow) which has support for FauxPilot.
+
+Contributions are encouraged :smile: https://gitlab.com/gitlab-org/gitlab-vscode-extension
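Regarding the tokenizer workaround quoted above, a hypothetical helper that copies the CodeGen-compatible files over the ones shipped with the Copilot extension; the extension directory varies by version, so EXT_DIR below is a placeholder you must fill in yourself, and SRC_DIR assumes a local fauxpilot checkout:

import pathlib
import shutil

# Placeholder paths: adjust for your machine and Copilot extension version.
SRC_DIR = pathlib.Path("copilot_proxy/cgtok/openai_format")
EXT_DIR = pathlib.Path.home() / ".vscode/extensions/github.copilot-<version>/dist"

for name in ("vocab.bpe", "tokenizer.json"):
    shutil.copy2(SRC_DIR / name, EXT_DIR / name)  # overwrite the extension's tokenizer files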
@@ -1,4 +0,0 @@
-TRITON_HOST=triton
-TRITON_PORT=8001
-API_HOST=0.0.0.0
-API_PORT=5000
@@ -160,5 +160,5 @@ def test_python_backend(n_gpus: int):
 
     # killing docker-compose process doesn't bring down the containers.
     # explicitly stop the containers:
-    subprocess.run(["docker-compose", "-f", compose_file, "down"], cwd=curdir, check=True)
+    subprocess.run(["docker-compose", "-f", compose_file, "down"], cwd=curdir, check=True, env=load_test_env())