Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 134 additions & 0 deletions llamacpp-embedding-rerank-server/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# llama.cpp 部署 Embedding和Rerank模型

## ARM架构下编译llama.cpp

1. 下载llama.cpp源码
```shell
cd /home
git clone https://github.com/ggml-org/llama.cpp.git
```

2. 拉取``ubuntu:22.04``镜像

```shell
docker pull ubuntu:22.04
docker run -itd --name ubuntu_llamacpp -v /home/llama.cpp:/home/llama.cpp ubuntu:22.04

docker exec -it ubuntu_llamacpp /bin/bash
```

3. 容器内编译

```shell
apt-get update
apt-get install -y git cmake libcurl4-openssl-dev build-essential
cd /home/llama.cpp
cmake -B build
cmake --build build --config Release -j 8
```

4. 宿主机上构建llama.cpp镜像

经过在上面容器内编译的步骤,``/home/llama.cpp``下新增``build``文件夹,里面就是基于``ubuntu:22.04``镜像环境编译的llama.cpp的库文件。下面就可根据这些文件再基于``ubuntu:22.04``构建``llama.cpp-arm64:server``镜像。

```shell
cd /home/llama.cpp
mkdir -p docker/lib
cp -r ./build/bin/*.so ./docker/lib/
cp ./build/bin/llama-server ./docker/llama-server
cd docker
vi Dockerfile
```

``Dockerfile``内容

```shell
FROM ubuntu:22.04

RUN apt-get update \
&& apt-get install -y libgomp1 curl \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
&& find /var/cache -type f -delete

COPY lib/ /app
COPY llama-server /app

ENV LLAMA_ARG_HOST=0.0.0.0

WORKDIR /app

ENTRYPOINT ["/app/llama-server"]
```

构建镜像

```shell
docker build -t llama.cpp-arm64:server .
```

## 下载模型

`Embedding`: bge-m3

`Rerank`: bge-reranker-v2-m3

```shell
pip config set global.index-url https://repo.huaweicloud.com/repository/pypi/simple
pip install modelscope
cd /home
mkdir models
cd models

modelscope download --model gpustack/bge-reranker-v2-m3-GGUF bge-reranker-v2-m3-Q8_0.gguf --local_dir ./bge-reranker-v2-m3-GGUF
modelscope download --model gpustack/bge-m3-GGUF bge-m3-Q8_0.gguf --local_dir ./bge-m3-GGUF
```

## 编制docker-compose文件

```yaml
services:
  llamacpp-embedding-server:
    image: llama.cpp-arm64:server
    container_name: llamacpp-embedding-server
    command: --embedding --pooling mean --verbose-prompt
    restart: always
    ports:
      - "8081:8080"
    volumes:
      - /home/models/bge-m3-GGUF:/models
    environment:
      LLAMA_ARG_MODEL: /models/bge-m3-Q8_0.gguf
      LLAMA_ARG_CTX_SIZE: "8192"
      LLAMA_ARG_N_PARALLEL: "8"
      LLAMA_ARG_PORT: "8080"
      LLAMA_ARG_BATCH: "8192"
      LLAMA_ARG_UBATCH: "8192"
      LLAMA_ARG_N_GPU_LAYERS_DRAFT: "0"

  llamacpp-rerank-server:
    image: llama.cpp-arm64:server
    container_name: llamacpp-rerank-server
    command: --reranking --pooling rank
    restart: always
    ports:
      - "8082:8080"
    volumes:
      - /home/models/bge-reranker-v2-m3-GGUF:/models
    environment:
      LLAMA_ARG_MODEL: /models/bge-reranker-v2-m3-Q8_0.gguf
      LLAMA_ARG_CTX_SIZE: "8192"
      LLAMA_ARG_N_PARALLEL: "8"
      LLAMA_ARG_PORT: "8080"
      LLAMA_ARG_BATCH: "8192"
      LLAMA_ARG_UBATCH: "8192"
      LLAMA_ARG_FLASH_ATTN: "enable"
      LLAMA_ARG_N_GPU_LAYERS_DRAFT: "0"
```

## 部署模型

```shell
docker compose up -d
```

Embedding和Rerank模型服务支持标准OpenAI接口规范
36 changes: 36 additions & 0 deletions llamacpp-embedding-rerank-server/docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
services:
  llamacpp-embedding-server:
    image: ghcr.io/ggml-org/llama.cpp:server
    container_name: llamacpp-embedding-server
    command: --embedding --pooling mean --verbose-prompt
    restart: always
    ports:
      # quoted so host:container pairs are never re-typed by the YAML parser
      - "8081:8080"
    volumes:
      - /home/models/bge-m3-GGUF:/models
    environment:
      LLAMA_ARG_MODEL: /models/bge-m3-Q8_0.gguf
      # env values quoted so they stay strings across YAML parsers/compose versions
      LLAMA_ARG_CTX_SIZE: "8192"
      LLAMA_ARG_N_PARALLEL: "8"
      LLAMA_ARG_PORT: "8080"
      # batch added to match the rerank service: without it, the UBATCH of 8192
      # would presumably be clamped to the smaller default batch — TODO confirm
      # against the llama.cpp server docs for the deployed version
      LLAMA_ARG_BATCH: "8192"
      LLAMA_ARG_UBATCH: "8192"
      LLAMA_ARG_N_GPU_LAYERS_DRAFT: "0"

  llamacpp-rerank-server:
    image: ghcr.io/ggml-org/llama.cpp:server
    container_name: llamacpp-rerank-server
    command: --reranking --pooling rank
    restart: always
    ports:
      - "8082:8080"
    volumes:
      - /home/models/bge-reranker-v2-m3-GGUF:/models
    environment:
      LLAMA_ARG_MODEL: /models/bge-reranker-v2-m3-Q8_0.gguf
      LLAMA_ARG_CTX_SIZE: "8192"
      LLAMA_ARG_N_PARALLEL: "8"
      LLAMA_ARG_PORT: "8080"
      LLAMA_ARG_BATCH: "8192"
      LLAMA_ARG_UBATCH: "8192"
      # NOTE(review): value "enable" kept as-is; verify it is an accepted
      # setting for LLAMA_ARG_FLASH_ATTN in the deployed llama.cpp version
      LLAMA_ARG_FLASH_ATTN: "enable"
      LLAMA_ARG_N_GPU_LAYERS_DRAFT: "0"
1 change: 1 addition & 0 deletions onnxruntime-embedding-rerank-server/.python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.12
Empty file.
19 changes: 19 additions & 0 deletions onnxruntime-embedding-rerank-server/app/download_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import os
from modelscope import snapshot_download

# Absolute path of this source file; used to locate the project root.
filepath = os.path.abspath(__file__)
# Models are stored in a "models" directory that is a sibling of the package
# containing this file (i.e. <project_root>/models).
models_dir = os.path.join(os.path.dirname(os.path.dirname(filepath)), "models")
os.makedirs(models_dir, exist_ok=True)

def download_embedding_models():
    """Ensure the bce-embedding-base_v1 model is present under ``models_dir``.

    On first call the snapshot is fetched from ModelScope; later calls find
    the directory already on disk and return immediately.

    Returns:
        str: local directory containing the embedding model files.
    """
    target_dir = os.path.join(models_dir, "bce-embedding-base_v1")
    if not os.path.exists(target_dir):
        # Download only when the local copy is missing.
        snapshot_download(model_id='netease-youdao/bce-embedding-base_v1', local_dir=target_dir)
    return target_dir

def download_rerank_models():
    """Ensure the bce-reranker-base_v1 model is present under ``models_dir``.

    Fetches the snapshot from ModelScope when the local directory is missing,
    otherwise reuses the cached copy.

    Returns:
        str: local directory containing the reranker model files.
    """
    target_dir = os.path.join(models_dir, "bce-reranker-base_v1")
    if not os.path.exists(target_dir):
        # Download only when the local copy is missing.
        snapshot_download(model_id='netease-youdao/bce-reranker-base_v1', local_dir=target_dir)
    return target_dir
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
from fastapi import APIRouter, Request, HTTPException
from typing import List,Union,Optional
from pydantic import BaseModel,Field
from uuid import uuid4
from log_config import logger
from fastapi import APIRouter, Request
from typing import List,Union
from pydantic import BaseModel
from model_service import EmbeddingService
import time
import logging


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

embedding_router = APIRouter(tags=['Embedding'])

Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
from onnxruntime import InferenceSession
from transformers import AutoTokenizer
from transformers import XLMRobertaTokenizer
from concurrent.futures import ThreadPoolExecutor
from typing import List
from log_config import logger
import numpy as np
import os

Expand All @@ -13,7 +12,7 @@ def __init__(self,embed_path:str,**kwargs):
if not os.path.isfile(embed_model_path):
raise ValueError(f"{embed_model_path} 文件不存在")
self.workers = kwargs.get('workers', 8)
self._tokenizer = AutoTokenizer.from_pretrained(embed_path)
self._tokenizer = XLMRobertaTokenizer.from_pretrained(embed_path)
providers = ['CPUExecutionProvider']
self._session = InferenceSession(embed_model_path, providers=providers)
self.thread_pool = ThreadPoolExecutor(max_workers=self.workers)
Expand Down
43 changes: 43 additions & 0 deletions onnxruntime-embedding-rerank-server/app/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from fastapi import FastAPI
from fastapi.concurrency import asynccontextmanager
from model_service import EmbeddingService,RerankService
from embedding_api import embedding_router
from rerank_api import rerank_router
from download_model import download_embedding_models,download_rerank_models
import logging
import os
import uvicorn

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Resolve model locations. NOTE: the original used
# os.getenv("LOCAL_...", download_...()) — the second argument is evaluated
# EAGERLY in Python, so the download ran even when a local path was supplied.
# Using `or` short-circuits: the download only happens when the env var is
# unset or empty.
EMBEDDING_MODEL_PATH = os.getenv("LOCAL_EMBEDDING_MODEL_PATH") or download_embedding_models()
RERANK_MODEL_PATH = os.getenv("LOCAL_RERANK_MODEL_PATH") or download_rerank_models()

logger.info(f"EMBEDDING_MODEL_PATH:{EMBEDDING_MODEL_PATH}")
logger.info(f"RERANK_MODEL_PATH:{RERANK_MODEL_PATH}")

@asynccontextmanager
async def lifespan(app:FastAPI):
    """Application lifespan: build the embedding and rerank engines once at
    startup and expose them on ``app.state``; statements after ``yield`` run
    at shutdown."""
    # NOTE(review): the second positional argument (8) is presumably a worker
    # count, but the visible EmbeddingService signature takes (path, **kwargs)
    # only — confirm this positional argument is actually accepted.
    app.state.embed_engine = EmbeddingService(EMBEDDING_MODEL_PATH, 8)
    app.state.rerank_engine = RerankService(RERANK_MODEL_PATH, 8)
    logger.info("应用初始化成功")
    yield
    logger.info("应用正常停止")

# Create the FastAPI application with the startup/shutdown hooks above.
app = FastAPI(
    title="Embedding and Reranker Inference Server",
    lifespan=lifespan,
    # NOTE(review): root_path only adjusts generated URLs for reverse-proxy
    # setups; it does not itself mount routes under /v1 — confirm the proxy
    # (or client base URL) supplies the /v1 prefix.
    root_path="/v1",
    version="1.0",
    license_info={"name":"Apache License 2.0","identifier":"Apache"}
)

# Register the embedding and rerank route groups.
app.include_router(embedding_router)
app.include_router(rerank_router)

if __name__ == "__main__":
    # Run the ASGI server directly when executed as a script.
    uvicorn.run(app, host="0.0.0.0", port=9997)
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
from fastapi import APIRouter, Request
from typing import List
from pydantic import BaseModel,Field
from pydantic import BaseModel
from uuid import uuid4
from log_config import logger
from model_service import RerankService
import time
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

rerank_router = APIRouter(tags=['Rerank'])

Expand Down Expand Up @@ -40,7 +43,7 @@ def text_rerank(request: Request,rerank_request:RerankRequest) -> RerankResponse
results = engine.execute(query,documents,top_n,return_documents)

# usage = Usage(prompt_tokens=len(documents),total_tokens=total_tokens)
logger.info("请求结果:\n%s\n耗时 %.4f ms",results,(time.perf_counter() - start_time) * 1000)
logger.info("请求耗时 %.4f ms,返回结果:\n%s",(time.perf_counter() - start_time) * 1000, results)
rerankResult = [
RerankResult(
index = result['id'],
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import onnxruntime as ort
from transformers import AutoTokenizer
from transformers import XLMRobertaTokenizer
from typing import List
from copy import deepcopy
from log_config import logger
from concurrent.futures import ThreadPoolExecutor
import numpy as np
import os
Expand All @@ -21,7 +20,7 @@ def __init__(self,rerank_path,**kwargs):
# 判断文件是否存在
if not os.path.isfile(rerank_model_path):
raise ValueError(f"{rerank_model_path} 文件不存在")
self._tokenizer = AutoTokenizer.from_pretrained(rerank_path)
self._tokenizer = XLMRobertaTokenizer.from_pretrained(rerank_path)
self.sep_id = self._tokenizer.sep_token_id
self.overlap_tokens = kwargs.get('overlap_tokens', 80)
self.max_length = kwargs.get('max_length', 512)
Expand All @@ -30,7 +29,6 @@ def __init__(self,rerank_path,**kwargs):
self.session = ort.InferenceSession(rerank_model_path, providers=providers)
self.thread_pool = ThreadPoolExecutor(max_workers=self.workers)


def tokenize_preproc(self, query: str, passages: List[str]):

def _merge_inputs(chunk1_raw, chunk2):
Expand Down
14 changes: 14 additions & 0 deletions onnxruntime-embedding-rerank-server/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
[project]
name = "onnxruntime-embedding-rerank-server"
version = "0.1.0"
description = "OpenAI-compatible embedding and rerank inference server based on ONNX Runtime"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"fastapi>=0.115.13",
"modelscope>=1.27.1",
"onnxruntime>=1.22.0",
"sentencepiece>=0.2.0",
"tiktoken>=0.9.0",
"transformers>=4.52.4",
]
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
nohup python -u app/main.py > logs/app.log 2>&1 &
nohup uv run app/main.py > app.log 2>&1 &
PID1=$!

# 生成close.sh脚本,写入kill命令
Expand Down
2 changes: 2 additions & 0 deletions onnxruntime-embedding-rerank-server/stop.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/bash
# Stop the embedding/rerank server.
# The previous version hard-coded a stale PID (kill -9 28070), which is wrong
# on any other machine or after a restart. Match the process by its entry
# script instead, and send SIGTERM so the server can shut down cleanly.
pkill -f "app/main.py" || echo "server is not running"
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import numpy as np
import string

client = HttpClient("http://127.0.0.1:9997/v1")
client = HttpClient("http://115.120.51.57:11558/v1")

query = "抗生素是谁发现的?请说明发现过程和时间。"
documents = [
Expand All @@ -14,7 +14,7 @@
]

payload = {
"model": "bce",
"model": "Qwen/Qwen3-Reranker-8B",
"query": query,
"documents": documents,
"top_n": 3, # 可选参数
Expand Down
Loading