rerank endpoint plugin

This commit is contained in:
2026-01-20 22:01:23 +01:00
parent 6c7f96145b
commit 8149ac8c8b
3 changed files with 119 additions and 53 deletions

View File

@@ -12,7 +12,7 @@ logging.basicConfig(
)
logger = logging.getLogger(__name__)
app = FastAPI(title="Ollama BGE Reranker (Working Workaround)")
app = FastAPI(title="Ollama Cross-Encoder Reranker API")
class RerankRequest(BaseModel):
model: str
@@ -56,17 +56,22 @@ async def score_document_cross_encoder_workaround(
index: int
) -> dict:
"""
Workaround for using BGE-reranker with Ollama.
Workaround for using cross-encoder reranker models with Ollama.
Works with: BGE-reranker, Qwen3-Reranker, and other cross-encoder models.
Based on: https://medium.com/@rosgluk/reranking-documents-with-ollama-and-qwen3-reranker-model-in-go-6dc9c2fb5f0b
Key discovery: When using concatenated query+doc embeddings,
The Problem: Cross-encoder models have a classification head that outputs relevance scores.
Ollama only exposes the embedding API, not the classification head.
The Workaround: Embed the concatenated query+doc text with the cross-encoder model.
Empirically, a LOWER embedding magnitude indicates a MORE relevant document, so the
scores are inverted to make higher values = more relevant (standard convention).
Steps:
1. Concatenate query and document in cross-encoder format
2. Get embedding of the concatenated text
3. Calculate magnitude (lower = more relevant)
3. Calculate magnitude (lower = more relevant for cross-encoders)
4. Invert and normalize to 0-1 (higher = more relevant)
"""
@@ -89,9 +94,10 @@ async def score_document_cross_encoder_workaround(
vec = np.array(embedding)
magnitude = float(np.linalg.norm(vec))
# CRITICAL DISCOVERY: For BGE-reranker via Ollama embeddings:
# CRITICAL DISCOVERY: For cross-encoder rerankers via Ollama embeddings:
# LOWER magnitude = MORE relevant document
# Observed range: ~15-25 (lower = better)
# This pattern applies to BGE, Qwen3-Reranker, and similar cross-encoder models
# Invert and normalize to 0-1 where higher score = more relevant
# Adjusted bounds based on empirical observations
@@ -123,19 +129,22 @@ async def check_ollama():
response = await client.get("http://localhost:11434/api/tags", timeout=5.0)
response.raise_for_status()
logger.info("✓ Successfully connected to Ollama")
logger.warning("⚠️ Using workaround: concatenation + magnitude")
logger.warning("⚠️ This is less accurate than proper cross-encoder usage")
logger.warning("⚠️ Using workaround: Ollama doesn't expose cross-encoder classification heads")
logger.warning("⚠️ Using concatenation + magnitude method instead")
logger.info("💡 Works with: BGE-reranker, Qwen3-Reranker, etc.")
except Exception as e:
logger.error(f"✗ Cannot connect to Ollama: {e}")
@app.post("/v1/rerank", response_model=RerankResponse)
async def rerank(request: RerankRequest):
"""
Rerank documents using BGE-reranker via Ollama workaround.
Rerank documents using cross-encoder models via Ollama workaround.
Supports: BGE-reranker, Qwen3-Reranker, and other cross-encoder models.
NOTE: This uses a workaround (magnitude of concatenated embeddings)
because Ollama doesn't expose BGE's classification head.
For best accuracy, use sentence-transformers directly.
because Ollama doesn't expose the cross-encoder classification head.
For best accuracy, use sentence-transformers or dedicated reranker APIs.
"""
if not request.documents:
raise HTTPException(status_code=400, detail="No documents provided")
@@ -168,18 +177,21 @@ def health_check():
"""Health check endpoint."""
return {
"status": "healthy",
"service": "ollama-bge-reranker-workaround",
"note": "Using magnitude workaround - less accurate than native"
"service": "ollama-cross-encoder-reranker",
"supported_models": "BGE-reranker, Qwen3-Reranker, etc.",
"method": "concatenation + magnitude workaround",
"note": "Ollama doesn't expose classification heads - using embedding magnitude"
}
if __name__ == "__main__":
import uvicorn
logger.info("=" * 60)
logger.info("Ollama BGE Reranker - WORKAROUND Implementation")
logger.info("Ollama Cross-Encoder Reranker API")
logger.info("=" * 60)
logger.info("Using concatenation + magnitude method")
logger.info("This works but is less accurate than proper cross-encoders")
logger.info("Supports: BGE-reranker, Qwen3-Reranker, etc.")
logger.info("Method: Concatenation + magnitude workaround")
logger.info("Why: Ollama doesn't expose cross-encoder classification heads")
logger.info("Starting on: http://0.0.0.0:8080")
logger.info("=" * 60)