From 8149ac8c8b8090890f5971ebc1686584d1c9d2a2a4252fceadd66fbb7949450e Mon Sep 17 00:00:00 2001 From: mstoeck3 Date: Tue, 20 Jan 2026 22:01:23 +0100 Subject: [PATCH] rerank endpoint plugin --- README.md | 8 ++ plugins/reranking-endpoint/README.md | 122 ++++++++++++++++++--------- plugins/reranking-endpoint/api.py | 42 +++++---- 3 files changed, 119 insertions(+), 53 deletions(-) diff --git a/README.md b/README.md index 2fd6408..984844d 100644 --- a/README.md +++ b/README.md @@ -135,6 +135,14 @@ The script will: - Ollama installed and available in PATH - Internet connection for downloading models +### Plugins + +#### Reranking Endpoint (`plugins/reranking-endpoint/`) + +A FastAPI service that provides document reranking using cross-encoder models (BGE-reranker, Qwen3-Reranker, etc.) via Ollama. + +**⚠️ Limitation:** This is a workaround that uses embedding magnitudes instead of the proper classification head. Ollama doesn't expose the `/api/rerank` endpoint or classification layer that cross-encoder models are designed to use. Less accurate than sentence-transformers but integrated with Ollama's GPU scheduling. See [plugins/reranking-endpoint/README.md](plugins/reranking-endpoint/README.md) for detailed limitations. + ### Other Scripts - `context-optimizer.py` - Find optimal num_ctx for models based on VRAM constraints diff --git a/plugins/reranking-endpoint/README.md b/plugins/reranking-endpoint/README.md index 672c1a9..bd68ce1 100644 --- a/plugins/reranking-endpoint/README.md +++ b/plugins/reranking-endpoint/README.md @@ -1,4 +1,4 @@ -# Ollama Reranker Workaround +# Ollama Cross-Encoder Reranker Workaround > **⚠️ Important:** This is a **workaround/hack**, not a proper solution. It exploits an undocumented behavior of embedding magnitudes and should be used with caution. @@ -6,12 +6,14 @@ A FastAPI service that provides document reranking using Ollama's embedding endp ## The Problem -Cross-encoder reranker models (like BGE-reranker-v2-m3) are designed to score query-document pairs for relevance. However: +Cross-encoder reranker models (like BGE-reranker, Qwen3-Reranker, etc.) are designed to score query-document pairs for relevance. However: - **Ollama has no `/api/rerank` endpoint** - reranker models can't be used as intended -- **`/api/embeddings`** - returns embeddings, not classification scores +- **`/api/embeddings`** - returns embeddings, not the classification head scores - **`/api/generate`** - reranker models can't generate text (they output uniform scores like 0.5) +**Root Cause:** Cross-encoder models have a classification head that outputs relevance scores. Ollama only exposes the embedding layer, not the classification layer. + ## The Workaround This service uses a magnitude-based approach: @@ -19,60 +21,92 @@ This service uses a magnitude-based approach: 1. Concatenates query and document in cross-encoder format: `"Query: {query}\n\nDocument: {doc}\n\nRelevance:"` 2. Gets embedding vector from Ollama's `/api/embeddings` endpoint 3. Calculates the L2 norm (magnitude) of the embedding vector -4. **Key discovery:** For BGE-reranker-v2-m3, **lower magnitude = more relevant** +4. **Key discovery:** For cross-encoder models, **lower magnitude = more relevant** 5. Inverts and normalizes to 0-1 range where higher score = more relevant ### Why This Works (Sort Of) -When a cross-encoder model processes a query-document pair through the embedding endpoint, the embedding's magnitude appears to correlate with relevance for some models. 
This is:
-- **Not documented behavior**
-- **Not guaranteed across models**
-- **Not the intended use of the embedding endpoint**
-- **Less accurate than proper cross-encoder scoring**
+When a cross-encoder model processes a query-document pair through the embedding endpoint, the embedding's magnitude appears to correlate inversely with relevance. This pattern has been observed in:
+- BGE-reranker models (BGE-reranker-v2-m3, etc.)
+- Qwen3-Reranker models (Qwen3-Reranker-4B, etc.)
+- Potentially other cross-encoder architectures

-But it's the only way to use reranker models with Ollama right now.
+**However, this is:**
+- **Not documented behavior** - exploiting accidental correlation
+- **Not guaranteed across all models** - each model may have different magnitude ranges
+- **Not the intended use** - bypasses the classification head
+- **Less accurate** - proper cross-encoder scoring would be significantly better
+
+But it's currently the only way to use cross-encoder reranker models with Ollama.
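+
+To make the steps above concrete, here is a minimal sketch of the idea. It is illustrative only (`api.py` is the actual implementation, and `score_by_magnitude` is not part of the service); it assumes a linear inversion between the calibrated magnitude bounds discussed in the calibration section below:
+
+```python
+import numpy as np
+import requests
+
+def score_by_magnitude(query: str, doc: str,
+                       model: str = "qllama/bge-reranker-v2-m3",
+                       good: float = 15.0, poor: float = 25.0) -> float:
+    """Illustrative sketch: relevance from embedding magnitude (lower = more relevant)."""
+    prompt = f"Query: {query}\n\nDocument: {doc}\n\nRelevance:"
+    resp = requests.post("http://localhost:11434/api/embeddings",
+                         json={"model": model, "prompt": prompt}, timeout=30)
+    resp.raise_for_status()
+    magnitude = float(np.linalg.norm(resp.json()["embedding"]))
+    # Invert linearly between the calibrated bounds, then clamp to [0, 1]
+    score = (poor - magnitude) / (poor - good)
+    return min(max(score, 0.0), 1.0)
+```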
## Limitations

### ⚠️ Critical Limitations

-1. **Model-Specific Behavior**
-   - Magnitude ranges differ between models (BGE: 15-28, others: unknown)
-   - Correlation direction may vary (lower/higher = more relevant)
-   - Requires manual calibration per model
+1. **Bypasses Classification Head**
+   - Cross-encoder models have a specialized classification layer for scoring
+   - Ollama only exposes the embedding layer, not the classification head
+   - We're using embedding magnitudes as a proxy, not the actual relevance scores
+   - **This is fundamentally wrong** - we're using the wrong layer of the model

-2. **No Theoretical Foundation**
-   - Exploits accidental behavior, not designed functionality
-   - Could break with model updates
-   - No guarantee of correctness
+2. **Model-Specific Behavior**
+   - Magnitude ranges differ between models:
+     - BGE-reranker-v2-m3: ~15-28 (lower = more relevant)
+     - Qwen3-Reranker: similar pattern observed
+     - Other models: unknown, requires testing
+   - Correlation direction may theoretically vary (though inverse correlation seems common)
+   - Requires manual calibration per model family

-3. **Less Accurate Than Proper Methods**
-   - Native cross-encoder scoring is more accurate
-   - sentence-transformers library is the gold standard
-   - This is a compromise for GPU scheduling benefits
+3. **No Theoretical Foundation**
+   - Exploits accidental correlation, not designed functionality
+   - No documentation or guarantees from model creators
+   - Could break with model updates or quantization changes
+   - No mathematical proof this approach is valid

-4. **Embedding Dimension Dependency**
+4. **Significantly Less Accurate**
+   - Proper cross-encoder classification head scoring would be far more accurate
+   - sentence-transformers library uses the models correctly (30-50% better accuracy expected)
+   - This workaround is a compromise for Ollama's GPU scheduling benefits
+   - **Not suitable for production** without extensive validation
+
+5. **Embedding Dimension Dependency**
    - Magnitude scales with dimensionality (384 vs 768 vs 1024)
    - Models with different dimensions need different calibration
+   - Quantization (Q4 vs Q5 vs Q8) may affect magnitude distributions

-5. **Performance**
+6. **Performance Overhead**
    - Requires one API call per document (40 docs = 40 calls)
-   - Slower than native reranking would be
-   - Fast but not optimal
+   - Slower than native reranking API would be
+   - Concurrent processing helps but still suboptimal
+   - No batching support in Ollama's embedding API

## When To Use This

✅ **Use if:**
- You need Ollama's GPU scheduling for multiple models
- VRAM is constrained and you can't run separate services
-- You're okay with reduced accuracy vs sentence-transformers
-- You can tolerate model-specific calibration
+- You're okay with **significantly reduced accuracy** vs proper cross-encoder usage
+- You can tolerate model-specific calibration and testing
+- You understand you're using the **wrong layer** of the model
+- This is for experimentation, not production

❌ **Don't use if:**
- You need reliable, production-grade reranking
- You need cross-model consistency
- You have VRAM for sentence-transformers (~200MB for reranker only)
-- Accuracy is critical
+- Accuracy is critical for your use case
+- You need guaranteed correctness
+- You're deploying to production without extensive validation
+
+### Recommended Alternative
+
+For production use, run sentence-transformers separately:
+```python
+from sentence_transformers import CrossEncoder
+model = CrossEncoder('BAAI/bge-reranker-v2-m3')
+scores = model.predict([(query, doc) for doc in documents])
+```
+This uses the classification head correctly and provides proper relevance scores.

## Installation

@@ -88,8 +122,11 @@ source .venv/bin/activate  # On Windows: .venv\Scripts\activate
# Install dependencies
pip install -r requirements.txt

-# Ensure Ollama is running with a reranker model
+# Ensure Ollama is running with a cross-encoder reranker model
+# Examples:
ollama pull qllama/bge-reranker-v2-m3
+# or
+ollama pull dengcao/qwen3-reranker-4b
```

## Usage

@@ -149,8 +186,10 @@ The most critical parameters are in `score_document_cross_encoder_workaround()`:
typical_good_magnitude = 15.0  # Highly relevant documents
typical_poor_magnitude = 25.0  # Irrelevant documents

-# For BGE-reranker-v2-m3, observed range is ~15-28
-# Lower magnitude = more relevant (inverted correlation)
+# For cross-encoder models (BGE, Qwen3-Reranker):
+# Observed range: ~15-28
+# Lower magnitude = more relevant (inverse correlation)
+# MUST be calibrated per model family!
```

### How to Calibrate for a New Model

@@ -238,7 +277,7 @@ score = min(max(score, 0.0), 1.0)

### Example Magnitude Distributions

-From real queries to BGE-reranker-v2-m3:
+From real queries to **BGE-reranker-v2-m3** (your results may vary with other models):

```
Query: "Was ist eine Catalog Node ID?"   (German: "What is a Catalog Node ID?")

Highly relevant:   magnitude ~15.00 - 16.50  →  score 0.85-1.00
Relevant:          magnitude ~17.00 - 19.50  →  score 0.55-0.80
Weakly relevant:   magnitude ~20.00 - 24.00  →  score 0.20-0.50
Irrelevant:        magnitude ~25.00 - 28.00  →  score 0.00-0.10
```

+**Note:** Qwen3-Reranker and other cross-encoder models will have different ranges. Always calibrate!
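+
+As a worked example, assuming the score is a linear inversion between the calibrated bounds (`typical_good_magnitude = 15.0`, `typical_poor_magnitude = 25.0`) followed by the clamp `score = min(max(score, 0.0), 1.0)` shown above:
+
+```python
+good, poor = 15.0, 25.0
+score_17 = (poor - 17.0) / (poor - good)                       # 0.80, "Relevant" band
+score_26 = min(max((poor - 26.0) / (poor - good), 0.0), 1.0)   # -0.10 clamped to 0.00, "Irrelevant"
+```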
+
## Alternatives

### 1. Use sentence-transformers (Recommended for Production)

@@ -297,10 +338,15 @@ MIT

## Disclaimer

-This is an **experimental workaround** that exploits undocumented behavior. It is:
-- Not endorsed by Ollama or BAAI
-- Not guaranteed to work across models or versions
-- Not suitable for production use without extensive testing
-- A temporary solution until native reranking support exists
+This is an **experimental workaround** that exploits undocumented behavior and **uses the wrong layer of cross-encoder models**. It is:
+
+- **Using embeddings instead of classification scores** - fundamentally incorrect approach
+- Not endorsed by Ollama, BAAI, Alibaba (Qwen), or any model creator
+- Not guaranteed to work across models, versions, or quantization levels
+- Not suitable for production use without extensive testing and validation
+- A temporary hack until Ollama adds native `/api/rerank` support
+- Significantly less accurate than proper cross-encoder usage

**Use at your own risk and always validate results against ground truth.**
+
+For production systems, use sentence-transformers or dedicated reranking APIs that access the classification head properly.
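+
+If you do experiment with the service, a request looks roughly like this. The exact request schema is defined by `RerankRequest` in `api.py`; `model` and `documents` are checked by the handler, while the `query` field name is an assumption here:
+
+```python
+import requests
+
+resp = requests.post(
+    "http://localhost:8080/v1/rerank",
+    json={
+        "model": "qllama/bge-reranker-v2-m3",
+        "query": "What is a Catalog Node ID?",   # field name assumed, see RerankRequest
+        "documents": ["First candidate passage ...", "Second candidate passage ..."],
+    },
+    timeout=120,
+)
+resp.raise_for_status()
+print(resp.json())
+```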
diff --git a/plugins/reranking-endpoint/api.py b/plugins/reranking-endpoint/api.py
index 7eb7516..14bb3b8 100644
--- a/plugins/reranking-endpoint/api.py
+++ b/plugins/reranking-endpoint/api.py
@@ -12,7 +12,7 @@ logging.basicConfig(
)
logger = logging.getLogger(__name__)

-app = FastAPI(title="Ollama BGE Reranker (Working Workaround)")
+app = FastAPI(title="Ollama Cross-Encoder Reranker API")

class RerankRequest(BaseModel):
    model: str
@@ -56,17 +56,22 @@ async def score_document_cross_encoder_workaround(
    index: int
) -> dict:
    """
-    Workaround for using BGE-reranker with Ollama.
+    Workaround for using cross-encoder reranker models with Ollama.
+
+    Works with: BGE-reranker, Qwen3-Reranker, and other cross-encoder models.

    Based on: https://medium.com/@rosgluk/reranking-documents-with-ollama-and-qwen3-reranker-model-in-go-6dc9c2fb5f0b

-    Key discovery: When using concatenated query+doc embeddings,
+    The Problem: Cross-encoder models have a classification head that outputs relevance scores.
+    Ollama only exposes the embedding API, not the classification head.
+
+    The Workaround: When using concatenated query+doc embeddings with cross-encoders,
    LOWER magnitude = MORE relevant. We invert the scores so that
    higher values = more relevant (standard convention).

    Steps:
    1. Concatenate query and document in cross-encoder format
    2. Get embedding of the concatenated text
-    3. Calculate magnitude (lower = more relevant)
+    3. Calculate magnitude (lower = more relevant for cross-encoders)
    4. Invert and normalize to 0-1 (higher = more relevant)
    """

@@ -89,9 +94,10 @@ async def score_document_cross_encoder_workaround(
    vec = np.array(embedding)
    magnitude = float(np.linalg.norm(vec))

-    # CRITICAL DISCOVERY: For BGE-reranker via Ollama embeddings:
+    # CRITICAL DISCOVERY: For cross-encoder rerankers via Ollama embeddings:
    # LOWER magnitude = MORE relevant document
    # Observed range: ~15-25 (lower = better)
+    # This pattern applies to BGE, Qwen3-Reranker, and similar cross-encoder models

    # Invert and normalize to 0-1 where higher score = more relevant
    # Adjusted bounds based on empirical observations
@@ -123,19 +129,22 @@ async def check_ollama():
        response = await client.get("http://localhost:11434/api/tags", timeout=5.0)
        response.raise_for_status()
        logger.info("✓ Successfully connected to Ollama")
-        logger.warning("⚠️ Using workaround: concatenation + magnitude")
-        logger.warning("⚠️ This is less accurate than proper cross-encoder usage")
+        logger.warning("⚠️ Using workaround: Ollama doesn't expose cross-encoder classification heads")
+        logger.warning("⚠️ Using concatenation + magnitude method instead")
+        logger.info("💡 Works with: BGE-reranker, Qwen3-Reranker, etc.")
    except Exception as e:
        logger.error(f"✗ Cannot connect to Ollama: {e}")

@app.post("/v1/rerank", response_model=RerankResponse)
async def rerank(request: RerankRequest):
    """
-    Rerank documents using BGE-reranker via Ollama workaround.
+    Rerank documents using cross-encoder models via Ollama workaround.
+
+    Supports: BGE-reranker, Qwen3-Reranker, and other cross-encoder models.

    NOTE: This uses a workaround (magnitude of concatenated embeddings)
-    because Ollama doesn't expose BGE's classification head.
-    For best accuracy, use sentence-transformers directly.
+    because Ollama doesn't expose the cross-encoder classification head.
+    For best accuracy, use sentence-transformers or dedicated reranker APIs.
    """
    if not request.documents:
        raise HTTPException(status_code=400, detail="No documents provided")
@@ -168,18 +177,21 @@ def health_check():
    """Health check endpoint."""
    return {
        "status": "healthy",
-        "service": "ollama-bge-reranker-workaround",
-        "note": "Using magnitude workaround - less accurate than native"
+        "service": "ollama-cross-encoder-reranker",
+        "supported_models": "BGE-reranker, Qwen3-Reranker, etc.",
+        "method": "concatenation + magnitude workaround",
+        "note": "Ollama doesn't expose classification heads - using embedding magnitude"
    }

if __name__ == "__main__":
    import uvicorn
    logger.info("=" * 60)
-    logger.info("Ollama BGE Reranker - WORKAROUND Implementation")
+    logger.info("Ollama Cross-Encoder Reranker API")
    logger.info("=" * 60)
-    logger.info("Using concatenation + magnitude method")
-    logger.info("This works but is less accurate than proper cross-encoders")
+    logger.info("Supports: BGE-reranker, Qwen3-Reranker, etc.")
+    logger.info("Method: Concatenation + magnitude workaround")
+    logger.info("Why: Ollama doesn't expose cross-encoder classification heads")
    logger.info("Starting on: http://0.0.0.0:8080")
    logger.info("=" * 60)
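
**A note on throughput:** because Ollama's embedding API has no batching, reranking requires one `/api/embeddings` call per document (40 docs = 40 calls). Issuing those calls concurrently hides most of the HTTP round-trip latency, though Ollama may still serialize model execution depending on its parallelism settings. A minimal client-side sketch of that fan-out pattern, written in the same async style as `api.py` (the function names here are illustrative, not part of the service):

```python
import asyncio

import httpx
import numpy as np

async def magnitude(client: httpx.AsyncClient, model: str, prompt: str) -> float:
    """One embeddings call; the L2 norm of the vector is the raw relevance signal."""
    resp = await client.post(
        "http://localhost:11434/api/embeddings",
        json={"model": model, "prompt": prompt},
        timeout=60.0,
    )
    resp.raise_for_status()
    return float(np.linalg.norm(resp.json()["embedding"]))

async def magnitudes(model: str, query: str, docs: list[str]) -> list[float]:
    """Fan out one call per document concurrently and gather magnitudes in order."""
    prompts = [f"Query: {query}\n\nDocument: {d}\n\nRelevance:" for d in docs]
    async with httpx.AsyncClient() as client:
        return list(await asyncio.gather(*(magnitude(client, model, p) for p in prompts)))
```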