rerank endpoint plugin
This commit is contained in:
@@ -12,7 +12,7 @@ logging.basicConfig(
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
app = FastAPI(title="Ollama BGE Reranker (Working Workaround)")
|
||||
app = FastAPI(title="Ollama Cross-Encoder Reranker API")
|
||||
|
||||
class RerankRequest(BaseModel):
|
||||
model: str
|
||||
@@ -56,17 +56,22 @@ async def score_document_cross_encoder_workaround(
|
||||
index: int
|
||||
) -> dict:
|
||||
"""
|
||||
Workaround for using BGE-reranker with Ollama.
|
||||
Workaround for using cross-encoder reranker models with Ollama.
|
||||
|
||||
Works with: BGE-reranker, Qwen3-Reranker, and other cross-encoder models.
|
||||
Based on: https://medium.com/@rosgluk/reranking-documents-with-ollama-and-qwen3-reranker-model-in-go-6dc9c2fb5f0b
|
||||
|
||||
Key discovery: When using concatenated query+doc embeddings,
|
||||
The Problem: Cross-encoder models have a classification head that outputs relevance scores.
|
||||
Ollama only exposes the embedding API, not the classification head.
|
||||
|
||||
The Workaround: When using concatenated query+doc embeddings with cross-encoders,
|
||||
LOWER magnitude = MORE relevant. We invert the scores so that
|
||||
higher values = more relevant (standard convention).
|
||||
|
||||
Steps:
|
||||
1. Concatenate query and document in cross-encoder format
|
||||
2. Get embedding of the concatenated text
|
||||
3. Calculate magnitude (lower = more relevant)
|
||||
3. Calculate magnitude (lower = more relevant for cross-encoders)
|
||||
4. Invert and normalize to 0-1 (higher = more relevant)
|
||||
"""
|
||||
|
||||
@@ -89,9 +94,10 @@ async def score_document_cross_encoder_workaround(
|
||||
vec = np.array(embedding)
|
||||
magnitude = float(np.linalg.norm(vec))
|
||||
|
||||
# CRITICAL DISCOVERY: For BGE-reranker via Ollama embeddings:
|
||||
# CRITICAL DISCOVERY: For cross-encoder rerankers via Ollama embeddings:
|
||||
# LOWER magnitude = MORE relevant document
|
||||
# Observed range: ~15-25 (lower = better)
|
||||
# This pattern applies to BGE, Qwen3-Reranker, and similar cross-encoder models
|
||||
|
||||
# Invert and normalize to 0-1 where higher score = more relevant
|
||||
# Adjusted bounds based on empirical observations
|
||||
@@ -123,19 +129,22 @@ async def check_ollama():
|
||||
response = await client.get("http://localhost:11434/api/tags", timeout=5.0)
|
||||
response.raise_for_status()
|
||||
logger.info("✓ Successfully connected to Ollama")
|
||||
logger.warning("⚠️ Using workaround: concatenation + magnitude")
|
||||
logger.warning("⚠️ This is less accurate than proper cross-encoder usage")
|
||||
logger.warning("⚠️ Using workaround: Ollama doesn't expose cross-encoder classification heads")
|
||||
logger.warning("⚠️ Using concatenation + magnitude method instead")
|
||||
logger.info("💡 Works with: BGE-reranker, Qwen3-Reranker, etc.")
|
||||
except Exception as e:
|
||||
logger.error(f"✗ Cannot connect to Ollama: {e}")
|
||||
|
||||
@app.post("/v1/rerank", response_model=RerankResponse)
|
||||
async def rerank(request: RerankRequest):
|
||||
"""
|
||||
Rerank documents using BGE-reranker via Ollama workaround.
|
||||
Rerank documents using cross-encoder models via Ollama workaround.
|
||||
|
||||
Supports: BGE-reranker, Qwen3-Reranker, and other cross-encoder models.
|
||||
|
||||
NOTE: This uses a workaround (magnitude of concatenated embeddings)
|
||||
because Ollama doesn't expose BGE's classification head.
|
||||
For best accuracy, use sentence-transformers directly.
|
||||
because Ollama doesn't expose the cross-encoder classification head.
|
||||
For best accuracy, use sentence-transformers or dedicated reranker APIs.
|
||||
"""
|
||||
if not request.documents:
|
||||
raise HTTPException(status_code=400, detail="No documents provided")
|
||||
@@ -168,18 +177,21 @@ def health_check():
|
||||
"""Health check endpoint."""
|
||||
return {
|
||||
"status": "healthy",
|
||||
"service": "ollama-bge-reranker-workaround",
|
||||
"note": "Using magnitude workaround - less accurate than native"
|
||||
"service": "ollama-cross-encoder-reranker",
|
||||
"supported_models": "BGE-reranker, Qwen3-Reranker, etc.",
|
||||
"method": "concatenation + magnitude workaround",
|
||||
"note": "Ollama doesn't expose classification heads - using embedding magnitude"
|
||||
}
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
|
||||
logger.info("=" * 60)
|
||||
logger.info("Ollama BGE Reranker - WORKAROUND Implementation")
|
||||
logger.info("Ollama Cross-Encoder Reranker API")
|
||||
logger.info("=" * 60)
|
||||
logger.info("Using concatenation + magnitude method")
|
||||
logger.info("This works but is less accurate than proper cross-encoders")
|
||||
logger.info("Supports: BGE-reranker, Qwen3-Reranker, etc.")
|
||||
logger.info("Method: Concatenation + magnitude workaround")
|
||||
logger.info("Why: Ollama doesn't expose cross-encoder classification heads")
|
||||
logger.info("Starting on: http://0.0.0.0:8080")
|
||||
logger.info("=" * 60)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user