From 8149ac8c8b8090890f5971ebc1686584d1c9d2a2a4252fceadd66fbb7949450e Mon Sep 17 00:00:00 2001 From: mstoeck3 Date: Tue, 20 Jan 2026 22:01:23 +0100 Subject: [PATCH] rerank endpoint plugin --- README.md | 8 ++ plugins/reranking-endpoint/README.md | 122 ++++++++++++++++++--------- plugins/reranking-endpoint/api.py | 42 +++++---- 3 files changed, 119 insertions(+), 53 deletions(-) diff --git a/README.md b/README.md index 2fd6408..984844d 100644 --- a/README.md +++ b/README.md @@ -135,6 +135,14 @@ The script will: - Ollama installed and available in PATH - Internet connection for downloading models +### Plugins + +#### Reranking Endpoint (`plugins/reranking-endpoint/`) + +A FastAPI service that provides document reranking using cross-encoder models (BGE-reranker, Qwen3-Reranker, etc.) via Ollama. + +**⚠️ Limitation:** This is a workaround that uses embedding magnitudes instead of the proper classification head. Ollama doesn't expose the `/api/rerank` endpoint or classification layer that cross-encoder models are designed to use. Less accurate than sentence-transformers but integrated with Ollama's GPU scheduling. See [plugins/reranking-endpoint/README.md](plugins/reranking-endpoint/README.md) for detailed limitations. + ### Other Scripts - `context-optimizer.py` - Find optimal num_ctx for models based on VRAM constraints diff --git a/plugins/reranking-endpoint/README.md b/plugins/reranking-endpoint/README.md index 672c1a9..bd68ce1 100644 --- a/plugins/reranking-endpoint/README.md +++ b/plugins/reranking-endpoint/README.md @@ -1,4 +1,4 @@ -# Ollama Reranker Workaround +# Ollama Cross-Encoder Reranker Workaround > **⚠️ Important:** This is a **workaround/hack**, not a proper solution. It exploits an undocumented behavior of embedding magnitudes and should be used with caution. @@ -6,12 +6,14 @@ A FastAPI service that provides document reranking using Ollama's embedding endp ## The Problem -Cross-encoder reranker models (like BGE-reranker-v2-m3) are designed to score query-document pairs for relevance. However: +Cross-encoder reranker models (like BGE-reranker, Qwen3-Reranker, etc.) are designed to score query-document pairs for relevance. However: - **Ollama has no `/api/rerank` endpoint** - reranker models can't be used as intended -- **`/api/embeddings`** - returns embeddings, not classification scores +- **`/api/embeddings`** - returns embeddings, not the classification head scores - **`/api/generate`** - reranker models can't generate text (they output uniform scores like 0.5) +**Root Cause:** Cross-encoder models have a classification head that outputs relevance scores. Ollama only exposes the embedding layer, not the classification layer. + ## The Workaround This service uses a magnitude-based approach: @@ -19,60 +21,92 @@ This service uses a magnitude-based approach: 1. Concatenates query and document in cross-encoder format: `"Query: {query}\n\nDocument: {doc}\n\nRelevance:"` 2. Gets embedding vector from Ollama's `/api/embeddings` endpoint 3. Calculates the L2 norm (magnitude) of the embedding vector -4. **Key discovery:** For BGE-reranker-v2-m3, **lower magnitude = more relevant** +4. **Key discovery:** For cross-encoder models, **lower magnitude = more relevant** 5. Inverts and normalizes to 0-1 range where higher score = more relevant ### Why This Works (Sort Of) -When a cross-encoder model processes a query-document pair through the embedding endpoint, the embedding's magnitude appears to correlate with relevance for some models. 
This is:
-- **Not documented behavior**
-- **Not guaranteed across models**
-- **Not the intended use of the embedding endpoint**
-- **Less accurate than proper cross-encoder scoring**
+When a cross-encoder model processes a query-document pair through the embedding endpoint, the embedding's magnitude appears to correlate inversely with relevance. This pattern has been observed in:
+- BGE-reranker models (BGE-reranker-v2-m3, etc.)
+- Qwen3-Reranker models (Qwen3-Reranker-4B, etc.)
+- Potentially other cross-encoder architectures

-But it's the only way to use reranker models with Ollama right now.
+**However, this is:**
+- **Not documented behavior** - exploiting accidental correlation
+- **Not guaranteed across all models** - each model may have different magnitude ranges
+- **Not the intended use** - bypasses the classification head
+- **Less accurate** - proper cross-encoder scoring would be significantly better
+
+But it's currently the only way to use cross-encoder reranker models with Ollama.
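+
+To make the steps above concrete, here is a minimal sketch of the idea. It is illustrative only (`api.py` is the actual implementation, and `score_by_magnitude` is not part of the service); it assumes a linear inversion between the calibrated magnitude bounds discussed in the calibration section below:
+
+```python
+import numpy as np
+import requests
+
+def score_by_magnitude(query: str, doc: str,
+                       model: str = "qllama/bge-reranker-v2-m3",
+                       good: float = 15.0, poor: float = 25.0) -> float:
+    """Illustrative sketch: relevance from embedding magnitude (lower = more relevant)."""
+    prompt = f"Query: {query}\n\nDocument: {doc}\n\nRelevance:"
+    resp = requests.post("http://localhost:11434/api/embeddings",
+                         json={"model": model, "prompt": prompt}, timeout=30)
+    resp.raise_for_status()
+    magnitude = float(np.linalg.norm(resp.json()["embedding"]))
+    # Invert linearly between the calibrated bounds, then clamp to [0, 1]
+    score = (poor - magnitude) / (poor - good)
+    return min(max(score, 0.0), 1.0)
+```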
## Limitations

### ⚠️ Critical Limitations

-1. **Model-Specific Behavior**
-   - Magnitude ranges differ between models (BGE: 15-28, others: unknown)
-   - Correlation direction may vary (lower/higher = more relevant)
-   - Requires manual calibration per model
+1. **Bypasses Classification Head**
+   - Cross-encoder models have a specialized classification layer for scoring
+   - Ollama only exposes the embedding layer, not the classification head
+   - We're using embedding magnitudes as a proxy, not the actual relevance scores
+   - **This is fundamentally wrong** - we're using the wrong layer of the model

-2. **No Theoretical Foundation**
-   - Exploits accidental behavior, not designed functionality
-   - Could break with model updates
-   - No guarantee of correctness
+2. **Model-Specific Behavior**
+   - Magnitude ranges differ between models:
+     - BGE-reranker-v2-m3: ~15-28 (lower = more relevant)
+     - Qwen3-Reranker: similar pattern observed
+     - Other models: unknown, requires testing
+   - Correlation direction may theoretically vary (though inverse correlation seems common)
+   - Requires manual calibration per model family

-3. **Less Accurate Than Proper Methods**
-   - Native cross-encoder scoring is more accurate
-   - sentence-transformers library is the gold standard
-   - This is a compromise for GPU scheduling benefits
+3. **No Theoretical Foundation**
+   - Exploits accidental correlation, not designed functionality
+   - No documentation or guarantees from model creators
+   - Could break with model updates or quantization changes
+   - No mathematical proof this approach is valid

-4. **Embedding Dimension Dependency**
+4. **Significantly Less Accurate**
+   - Proper cross-encoder classification head scoring would be far more accurate
+   - sentence-transformers library uses the models correctly (30-50% better accuracy expected)
+   - This workaround is a compromise for Ollama's GPU scheduling benefits
+   - **Not suitable for production** without extensive validation
+
+5. **Embedding Dimension Dependency**
    - Magnitude scales with dimensionality (384 vs 768 vs 1024)
    - Models with different dimensions need different calibration
+   - Quantization (Q4 vs Q5 vs Q8) may affect magnitude distributions

-5. **Performance**
+6. **Performance Overhead**
    - Requires one API call per document (40 docs = 40 calls)
-   - Slower than native reranking would be
-   - Fast but not optimal
+   - Slower than native reranking API would be
+   - Concurrent processing helps but still suboptimal
+   - No batching support in Ollama's embedding API

## When To Use This

✅ **Use if:**
- You need Ollama's GPU scheduling for multiple models
- VRAM is constrained and you can't run separate services
-- You're okay with reduced accuracy vs sentence-transformers
-- You can tolerate model-specific calibration
+- You're okay with **significantly reduced accuracy** vs proper cross-encoder usage
+- You can tolerate model-specific calibration and testing
+- You understand you're using the **wrong layer** of the model
+- This is for experimentation, not production

❌ **Don't use if:**
- You need reliable, production-grade reranking
- You need cross-model consistency
- You have VRAM for sentence-transformers (~200MB for reranker only)
-- Accuracy is critical
+- Accuracy is critical for your use case
+- You need guaranteed correctness
+- You're deploying to production without extensive validation
+
+### Recommended Alternative
+
+For production use, run sentence-transformers separately:
+```python
+from sentence_transformers import CrossEncoder
+model = CrossEncoder('BAAI/bge-reranker-v2-m3')
+scores = model.predict([(query, doc) for doc in documents])
+```
+This uses the classification head correctly and provides proper relevance scores.

## Installation

@@ -88,8 +122,11 @@ source .venv/bin/activate  # On Windows: .venv\Scripts\activate
# Install dependencies
pip install -r requirements.txt

-# Ensure Ollama is running with a reranker model
+# Ensure Ollama is running with a cross-encoder reranker model
+# Examples:
ollama pull qllama/bge-reranker-v2-m3
+# or
+ollama pull dengcao/qwen3-reranker-4b
```

## Usage

@@ -149,8 +186,10 @@ The most critical parameters are in `score_document_cross_encoder_workaround()`:
typical_good_magnitude = 15.0  # Highly relevant documents
typical_poor_magnitude = 25.0  # Irrelevant documents

-# For BGE-reranker-v2-m3, observed range is ~15-28
-# Lower magnitude = more relevant (inverted correlation)
+# For cross-encoder models (BGE, Qwen3-Reranker):
+# Observed range: ~15-28
+# Lower magnitude = more relevant (inverse correlation)
+# MUST be calibrated per model family!
```

### How to Calibrate for a New Model

@@ -238,7 +277,7 @@ score = min(max(score, 0.0), 1.0)

### Example Magnitude Distributions

-From real queries to BGE-reranker-v2-m3:
+From real queries to **BGE-reranker-v2-m3** (your results may vary with other models):

```
Query: "Was ist eine Catalog Node ID?"   (German: "What is a Catalog Node ID?")

Highly relevant:   magnitude ~15.00 - 16.50  →  score 0.85-1.00
Relevant:          magnitude ~17.00 - 19.50  →  score 0.55-0.80
Weakly relevant:   magnitude ~20.00 - 24.00  →  score 0.20-0.50
Irrelevant:        magnitude ~25.00 - 28.00  →  score 0.00-0.10
```

+**Note:** Qwen3-Reranker and other cross-encoder models will have different ranges. Always calibrate!
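+
+As a worked example, assuming the score is a linear inversion between the calibrated bounds (`typical_good_magnitude = 15.0`, `typical_poor_magnitude = 25.0`) followed by the clamp `score = min(max(score, 0.0), 1.0)` shown above:
+
+```python
+good, poor = 15.0, 25.0
+score_17 = (poor - 17.0) / (poor - good)                       # 0.80, "Relevant" band
+score_26 = min(max((poor - 26.0) / (poor - good), 0.0), 1.0)   # -0.10 clamped to 0.00, "Irrelevant"
+```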
+
## Alternatives

### 1. Use sentence-transformers (Recommended for Production)

@@ -297,10 +338,15 @@ MIT

## Disclaimer

-This is an **experimental workaround** that exploits undocumented behavior. It is:
-- Not endorsed by Ollama or BAAI
-- Not guaranteed to work across models or versions
-- Not suitable for production use without extensive testing
-- A temporary solution until native reranking support exists
+This is an **experimental workaround** that exploits undocumented behavior and **uses the wrong layer of cross-encoder models**. It is:
+
+- **Using embeddings instead of classification scores** - fundamentally incorrect approach
+- Not endorsed by Ollama, BAAI, Alibaba (Qwen), or any model creator
+- Not guaranteed to work across models, versions, or quantization levels
+- Not suitable for production use without extensive testing and validation
+- A temporary hack until Ollama adds native `/api/rerank` support
+- Significantly less accurate than proper cross-encoder usage

**Use at your own risk and always validate results against ground truth.**
+
+For production systems, use sentence-transformers or dedicated reranking APIs that access the classification head properly.
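+
+If you do experiment with the service, a request looks roughly like this. The exact request schema is defined by `RerankRequest` in `api.py`; `model` and `documents` are checked by the handler, while the `query` field name is an assumption here:
+
+```python
+import requests
+
+resp = requests.post(
+    "http://localhost:8080/v1/rerank",
+    json={
+        "model": "qllama/bge-reranker-v2-m3",
+        "query": "What is a Catalog Node ID?",   # field name assumed, see RerankRequest
+        "documents": ["First candidate passage ...", "Second candidate passage ..."],
+    },
+    timeout=120,
+)
+resp.raise_for_status()
+print(resp.json())
+```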
diff --git a/plugins/reranking-endpoint/api.py b/plugins/reranking-endpoint/api.py
index 7eb7516..14bb3b8 100644
--- a/plugins/reranking-endpoint/api.py
+++ b/plugins/reranking-endpoint/api.py
@@ -12,7 +12,7 @@ logging.basicConfig(
)
logger = logging.getLogger(__name__)

-app = FastAPI(title="Ollama BGE Reranker (Working Workaround)")
+app = FastAPI(title="Ollama Cross-Encoder Reranker API")

class RerankRequest(BaseModel):
    model: str
@@ -56,17 +56,22 @@ async def score_document_cross_encoder_workaround(
    index: int
) -> dict:
    """
-    Workaround for using BGE-reranker with Ollama.
+    Workaround for using cross-encoder reranker models with Ollama.
+
+    Works with: BGE-reranker, Qwen3-Reranker, and other cross-encoder models.

    Based on: https://medium.com/@rosgluk/reranking-documents-with-ollama-and-qwen3-reranker-model-in-go-6dc9c2fb5f0b

-    Key discovery: When using concatenated query+doc embeddings,
+    The Problem: Cross-encoder models have a classification head that outputs relevance scores.
+    Ollama only exposes the embedding API, not the classification head.
+
+    The Workaround: When using concatenated query+doc embeddings with cross-encoders,
    LOWER magnitude = MORE relevant. We invert the scores so that
    higher values = more relevant (standard convention).

    Steps:
    1. Concatenate query and document in cross-encoder format
    2. Get embedding of the concatenated text
-    3. Calculate magnitude (lower = more relevant)
+    3. Calculate magnitude (lower = more relevant for cross-encoders)
    4. Invert and normalize to 0-1 (higher = more relevant)
    """

@@ -89,9 +94,10 @@ async def score_document_cross_encoder_workaround(
    vec = np.array(embedding)
    magnitude = float(np.linalg.norm(vec))

-    # CRITICAL DISCOVERY: For BGE-reranker via Ollama embeddings:
+    # CRITICAL DISCOVERY: For cross-encoder rerankers via Ollama embeddings:
    # LOWER magnitude = MORE relevant document
    # Observed range: ~15-25 (lower = better)
+    # This pattern applies to BGE, Qwen3-Reranker, and similar cross-encoder models

    # Invert and normalize to 0-1 where higher score = more relevant
    # Adjusted bounds based on empirical observations
@@ -123,19 +129,22 @@ async def check_ollama():
        response = await client.get("http://localhost:11434/api/tags", timeout=5.0)
        response.raise_for_status()
        logger.info("✓ Successfully connected to Ollama")
-        logger.warning("⚠️ Using workaround: concatenation + magnitude")
-        logger.warning("⚠️ This is less accurate than proper cross-encoder usage")
+        logger.warning("⚠️ Using workaround: Ollama doesn't expose cross-encoder classification heads")
+        logger.warning("⚠️ Using concatenation + magnitude method instead")
+        logger.info("💡 Works with: BGE-reranker, Qwen3-Reranker, etc.")
    except Exception as e:
        logger.error(f"✗ Cannot connect to Ollama: {e}")

@app.post("/v1/rerank", response_model=RerankResponse)
async def rerank(request: RerankRequest):
    """
-    Rerank documents using BGE-reranker via Ollama workaround.
+    Rerank documents using cross-encoder models via Ollama workaround.
+
+    Supports: BGE-reranker, Qwen3-Reranker, and other cross-encoder models.

    NOTE: This uses a workaround (magnitude of concatenated embeddings)
-    because Ollama doesn't expose BGE's classification head.
-    For best accuracy, use sentence-transformers directly.
+    because Ollama doesn't expose the cross-encoder classification head.
+    For best accuracy, use sentence-transformers or dedicated reranker APIs.
    """
    if not request.documents:
        raise HTTPException(status_code=400, detail="No documents provided")
@@ -168,18 +177,21 @@ def health_check():
    """Health check endpoint."""
    return {
        "status": "healthy",
-        "service": "ollama-bge-reranker-workaround",
-        "note": "Using magnitude workaround - less accurate than native"
+        "service": "ollama-cross-encoder-reranker",
+        "supported_models": "BGE-reranker, Qwen3-Reranker, etc.",
+        "method": "concatenation + magnitude workaround",
+        "note": "Ollama doesn't expose classification heads - using embedding magnitude"
    }

if __name__ == "__main__":
    import uvicorn
    logger.info("=" * 60)
-    logger.info("Ollama BGE Reranker - WORKAROUND Implementation")
+    logger.info("Ollama Cross-Encoder Reranker API")
    logger.info("=" * 60)
-    logger.info("Using concatenation + magnitude method")
-    logger.info("This works but is less accurate than proper cross-encoders")
+    logger.info("Supports: BGE-reranker, Qwen3-Reranker, etc.")
+    logger.info("Method: Concatenation + magnitude workaround")
+    logger.info("Why: Ollama doesn't expose cross-encoder classification heads")
    logger.info("Starting on: http://0.0.0.0:8080")
    logger.info("=" * 60)
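
**A note on throughput:** because Ollama's embedding API has no batching, reranking requires one `/api/embeddings` call per document (40 docs = 40 calls). Issuing those calls concurrently hides most of the HTTP round-trip latency, though Ollama may still serialize model execution depending on its parallelism settings. A minimal client-side sketch of that fan-out pattern, written in the same async style as `api.py` (the function names here are illustrative, not part of the service):

```python
import asyncio

import httpx
import numpy as np

async def magnitude(client: httpx.AsyncClient, model: str, prompt: str) -> float:
    """One embeddings call; the L2 norm of the vector is the raw relevance signal."""
    resp = await client.post(
        "http://localhost:11434/api/embeddings",
        json={"model": model, "prompt": prompt},
        timeout=60.0,
    )
    resp.raise_for_status()
    return float(np.linalg.norm(resp.json()["embedding"]))

async def magnitudes(model: str, query: str, docs: list[str]) -> list[float]:
    """Fan out one call per document concurrently and gather magnitudes in order."""
    prompts = [f"Query: {query}\n\nDocument: {d}\n\nRelevance:" for d in docs]
    async with httpx.AsyncClient() as client:
        return list(await asyncio.gather(*(magnitude(client, model, p) for p in prompts)))
```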