rerank endpoint plugin

2026-01-20 22:01:23 +01:00
parent 6c7f96145b
commit 8149ac8c8b
3 changed files with 119 additions and 53 deletions
--- a/plugins/reranking-endpoint/api.py
+++ b/plugins/reranking-endpoint/api.py
@@ -12,7 +12,7 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)

-app = FastAPI(title="Ollama BGE Reranker (Working Workaround)")
+app = FastAPI(title="Ollama Cross-Encoder Reranker API")

 class RerankRequest(BaseModel):
    model: str
@@ -56,17 +56,22 @@ async def score_document_cross_encoder_workaround(
    index: int
 ) -> dict:
    """
-    Workaround for using BGE-reranker with Ollama.
+    Workaround for using cross-encoder reranker models with Ollama.
+    
+    Works with: BGE-reranker, Qwen3-Reranker, and other cross-encoder models.
    Based on: https://medium.com/@rosgluk/reranking-documents-with-ollama-and-qwen3-reranker-model-in-go-6dc9c2fb5f0b
    
-    Key discovery: When using concatenated query+doc embeddings,
+    The Problem: Cross-encoder models have a classification head that outputs relevance scores.
+    Ollama only exposes the embedding API, not the classification head.
+    
+    The Workaround: When embedding the concatenated query+doc text with a cross-encoder,
    LOWER magnitude = MORE relevant. We invert the scores so that
    higher values = more relevant (standard convention).
    
    Steps:
    1. Concatenate query and document in cross-encoder format
    2. Get embedding of the concatenated text
-    3. Calculate magnitude (lower = more relevant)
+    3. Calculate magnitude (lower = more relevant for cross-encoders)
    4. Invert and normalize to 0-1 (higher = more relevant)
    """
    
@@ -89,9 +94,10 @@ async def score_document_cross_encoder_workaround(
    vec = np.array(embedding)
    magnitude = float(np.linalg.norm(vec))
    
-    # CRITICAL DISCOVERY: For BGE-reranker via Ollama embeddings:
+    # CRITICAL DISCOVERY: For cross-encoder rerankers via Ollama embeddings:
    # LOWER magnitude = MORE relevant document
    # Observed range: ~15-25 (lower = better)
+    # This pattern applies to BGE, Qwen3-Reranker, and similar cross-encoder models
    
    # Invert and normalize to 0-1 where higher score = more relevant
    # Adjusted bounds based on empirical observations
@@ -123,19 +129,22 @@ async def check_ollama():
            response = await client.get("http://localhost:11434/api/tags", timeout=5.0)
            response.raise_for_status()
            logger.info("✓ Successfully connected to Ollama")
-            logger.warning("⚠️  Using workaround: concatenation + magnitude")
-            logger.warning("⚠️  This is less accurate than proper cross-encoder usage")
+            logger.warning("⚠️  Using workaround: Ollama doesn't expose cross-encoder classification heads")
+            logger.warning("⚠️  Using concatenation + magnitude method instead")
+            logger.info("💡 Works with: BGE-reranker, Qwen3-Reranker, etc.")
    except Exception as e:
        logger.error(f"✗ Cannot connect to Ollama: {e}")

@app.post("/v1/rerank", response_model=RerankResponse)
 async def rerank(request: RerankRequest):
    """
-    Rerank documents using BGE-reranker via Ollama workaround.
+    Rerank documents using cross-encoder models via Ollama workaround.
+    
+    Supports: BGE-reranker, Qwen3-Reranker, and other cross-encoder models.
    
    NOTE: This uses a workaround (magnitude of concatenated embeddings)
-    because Ollama doesn't expose BGE's classification head.
-    For best accuracy, use sentence-transformers directly.
+    because Ollama doesn't expose the cross-encoder classification head.
+    For best accuracy, use sentence-transformers or dedicated reranker APIs.
    """
    if not request.documents:
        raise HTTPException(status_code=400, detail="No documents provided")
@@ -168,18 +177,21 @@ def health_check():
    """Health check endpoint."""
    return {
        "status": "healthy",
-        "service": "ollama-bge-reranker-workaround",
-        "note": "Using magnitude workaround - less accurate than native"
+        "service": "ollama-cross-encoder-reranker",
+        "supported_models": "BGE-reranker, Qwen3-Reranker, etc.",
+        "method": "concatenation + magnitude workaround",
+        "note": "Ollama doesn't expose classification heads - using embedding magnitude"
    }

 if __name__ == "__main__":
    import uvicorn
    
    logger.info("=" * 60)
-    logger.info("Ollama BGE Reranker - WORKAROUND Implementation")
+    logger.info("Ollama Cross-Encoder Reranker API")
    logger.info("=" * 60)
-    logger.info("Using concatenation + magnitude method")
-    logger.info("This works but is less accurate than proper cross-encoders")
+    logger.info("Supports: BGE-reranker, Qwen3-Reranker, etc.")
+    logger.info("Method: Concatenation + magnitude workaround")
+    logger.info("Why: Ollama doesn't expose cross-encoder classification heads")
    logger.info("Starting on: http://0.0.0.0:8080")
    logger.info("=" * 60)