fix: silent failures for context optimizer
context-optimizer.py
@@ -266,7 +266,15 @@ def find_optimal_context(model_name: str, max_turns: Optional[int], overhead_gb:
     results = []
 
     # Turn 1: Test current setting to establish baseline
+    # If current_ctx seems too high, start with a safer baseline
     test_ctx = current_ctx if current_ctx > 0 else 8192
+
+    # Safety check: If current_ctx is very large and we know VRAM constraints, start smaller
+    if target_vram and current_ctx > 65536:
+        # Start with a more conservative baseline
+        test_ctx = 16384
+        print(f"⚠ Current num_ctx={current_ctx:,} is very large, starting with safer baseline={test_ctx:,}")
+
     turn_label = f"Turn 1/{max_turns}" if max_turns else "Turn 1"
     print(f"{turn_label}: Testing num_ctx={test_ctx:,} (baseline)...", end=' ', flush=True)
     result = test_context_size(model_name, test_ctx)
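
Note: the baseline logic above leans entirely on the shape of the dict returned by `test_context_size`, which is outside this diff. Below is a minimal sketch of the assumed contract (a dict carrying either 'vram_gb'/'offload_pct' or an 'error' key), probing Ollama's /api/generate and /api/ps endpoints; the real helper in context-optimizer.py may differ in detail:

import requests

OLLAMA = "http://localhost:11434"  # assumed default Ollama endpoint

def test_context_size(model_name: str, num_ctx: int) -> dict:
    """Load model_name at num_ctx and report VRAM use, or an 'error' key on failure."""
    try:
        r = requests.post(
            f"{OLLAMA}/api/generate",
            json={"model": model_name, "prompt": "hi",
                  "options": {"num_ctx": num_ctx}, "stream": False},
            timeout=300,
        )
        r.raise_for_status()
    except requests.RequestException as exc:
        return {"num_ctx": num_ctx, "error": str(exc)}

    # /api/ps lists loaded models with their total size and VRAM-resident share
    for m in requests.get(f"{OLLAMA}/api/ps", timeout=10).json().get("models", []):
        if m["name"].startswith(model_name):
            size, vram = m["size"], m["size_vram"]
            return {"num_ctx": num_ctx,
                    "vram_gb": vram / 2**30,
                    "offload_pct": 100.0 * (size - vram) / size if size else 0.0}
    return {"num_ctx": num_ctx, "error": "model not listed in /api/ps"}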
@@ -276,9 +284,53 @@ def find_optimal_context(model_name: str, max_turns: Optional[int], overhead_gb:
         print(f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: {result['offload_pct']:.1f}% CPU" if result['offload_pct'] > 0 else f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: GPU only")
         baseline_vram = result['vram_gb']
         baseline_ctx = test_ctx
+    elif result and 'error' in result:
+        # First test failed - try with a much smaller context
+        error_msg = result['error']
+        if 'memory' in error_msg.lower() or 'oom' in error_msg.lower():
+            print(f"✗ OOM - baseline too large, retrying with num_ctx=8192")
+            test_ctx = 8192
+            print(f"{turn_label} (retry): Testing num_ctx={test_ctx:,} (safe baseline)...", end=' ', flush=True)
+            result = test_context_size(model_name, test_ctx)
+
+            if result and 'error' not in result:
+                results.append(result)
+                print(f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: {result['offload_pct']:.1f}% CPU" if result['offload_pct'] > 0 else f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: GPU only")
+                baseline_vram = result['vram_gb']
+                baseline_ctx = test_ctx
+            else:
+                print("✗ Failed even with minimal context")
+                return {
+                    'model': model_name,
+                    'results': results,
+                    'max_context': max_context,
+                    'current_ctx': current_ctx,
+                    'vram_total': vram_total,
+                    'info': info,
+                    'error': 'Model failed to load even with minimal context (8K). Check VRAM availability.'
+                }
+        else:
+            print(f"✗ Error: {error_msg[:50]}")
+            return {
+                'model': model_name,
+                'results': results,
+                'max_context': max_context,
+                'current_ctx': current_ctx,
+                'vram_total': vram_total,
+                'info': info,
+                'error': error_msg
+            }
     else:
         print("✗ Failed")
-        return {'model': model_name, 'results': results, 'max_context': max_context, 'current_ctx': current_ctx, 'vram_total': vram_total, 'info': info}
+        return {
+            'model': model_name,
+            'results': results,
+            'max_context': max_context,
+            'current_ctx': current_ctx,
+            'vram_total': vram_total,
+            'info': info,
+            'error': 'Unknown failure during baseline test'
+        }
 
     # Turn 2: Test a higher context to calculate VRAM/context ratio
     # Try doubling the context or 32K, whichever is smaller
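
The net effect of this hunk is that every baseline failure path now returns a dict carrying an 'error' key (plus the usual model/context metadata) instead of silently falling through with an empty results list. A hedged sketch of what a caller can now rely on; the model name and argument values below are illustrative:

analysis = find_optimal_context("llama3:8b", max_turns=5, overhead_gb=1.0)

if 'error' in analysis:
    # Failure is now explicit: the dict still carries model/context
    # metadata alongside the human-readable reason.
    raise SystemExit(f"Optimization of {analysis['model']} failed: {analysis['error']}")

for r in analysis['results']:
    print(r)  # one entry per successfully tested num_ctx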
@@ -445,7 +497,24 @@ def find_optimal_context(model_name: str, max_turns: Optional[int], overhead_gb:
 
 def print_recommendation(analysis: Dict):
     """Print optimization recommendations."""
-    if not analysis or not analysis.get('results'):
+    if not analysis:
         print("\n✗ No results to analyze")
         return
 
+    # Check for errors first
+    if 'error' in analysis:
+        print("\n" + "="*70)
+        print("OPTIMIZATION FAILED")
+        print("="*70)
+        print(f"\n✗ Error: {analysis['error']}")
+        print(f"\n💡 Suggestions:")
+        print(f" 1. Check that the model is installed: ollama list")
+        print(f" 2. Ensure sufficient VRAM is available")
+        print(f" 3. Try unloading other models: ollama ps")
+        print(f" 4. Consider using a smaller model or lower quantization")
+        return
+
+    if not analysis.get('results'):
+        print("\n✗ No results to analyze")
+        return
+
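With the error branch in place, a failed analysis dict can be handed to print_recommendation directly. A small illustration; the dict values are made up, only the keys come from the diff above:

print_recommendation({
    'model': 'llama3:8b',   # illustrative
    'results': [],
    'error': 'Model failed to load even with minimal context (8K). Check VRAM availability.',
})
# Prints the OPTIMIZATION FAILED banner, the error, and the four
# troubleshooting suggestions, then returns without analyzing results.
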
web_app.py (18 changed lines)
@@ -944,12 +944,28 @@ def api_optimize_context(model_name):
     # Use the existing find_optimal_context function from context-optimizer.py
     result = context_optimizer_module.find_optimal_context(model_name, max_turns=max_turns, overhead_gb=overhead_gb)
 
-    if not result or 'results' not in result:
+    if not result:
         return jsonify({
             'success': False,
             'error': 'Optimization failed or no results returned'
         })
 
+    # Check if optimization encountered an error
+    if 'error' in result:
+        return jsonify({
+            'success': False,
+            'error': result['error'],
+            'model': model_name,
+            'max_context': result.get('max_context', 0),
+            'current_context': result.get('current_ctx', 0)
+        })
+
+    if 'results' not in result or len(result['results']) == 0:
+        return jsonify({
+            'success': False,
+            'error': 'No test results available. Model may have failed to load.'
+        })
+
     # Extract data from results
     test_results = []
     optimal_context = 0
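
On the API side, a failed optimization now comes back as a structured JSON error rather than a crash or an empty payload. A hedged client-side sketch; the route path, host, and HTTP method are assumptions (the decorator for api_optimize_context is outside this hunk), and only the response fields come from the diff:

import requests

# Hypothetical route and port for the Flask app
resp = requests.post("http://localhost:5000/api/optimize_context/llama3:8b").json()

if not resp.get('success'):
    # e.g. {'success': False, 'error': '...', 'model': 'llama3:8b',
    #       'max_context': 0, 'current_context': 0}
    print(f"Optimization failed: {resp['error']}")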