diff --git a/scripts/context-optimizer.py b/scripts/context-optimizer.py
index 8eb9654..9866abd 100755
--- a/scripts/context-optimizer.py
+++ b/scripts/context-optimizer.py
@@ -266,7 +266,15 @@ def find_optimal_context(model_name: str, max_turns: Optional[int], overhead_gb:
     results = []
 
     # Turn 1: Test current setting to establish baseline
+    # If current_ctx seems too high, start with a safer baseline
     test_ctx = current_ctx if current_ctx > 0 else 8192
+
+    # Safety check: If current_ctx is very large and we know VRAM constraints, start smaller
+    if target_vram and current_ctx > 65536:
+        # Start with a more conservative baseline
+        test_ctx = 16384
+        print(f"⚠ Current num_ctx={current_ctx:,} is very large, starting with safer baseline={test_ctx:,}")
+
     turn_label = f"Turn 1/{max_turns}" if max_turns else "Turn 1"
     print(f"{turn_label}: Testing num_ctx={test_ctx:,} (baseline)...", end=' ', flush=True)
     result = test_context_size(model_name, test_ctx)
@@ -276,9 +284,53 @@ def find_optimal_context(model_name: str, max_turns: Optional[int], overhead_gb:
         print(f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: {result['offload_pct']:.1f}% CPU" if result['offload_pct'] > 0 else f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: GPU only")
         baseline_vram = result['vram_gb']
         baseline_ctx = test_ctx
+    elif result and 'error' in result:
+        # First test failed - try with a much smaller context
+        error_msg = result['error']
+        if 'memory' in error_msg.lower() or 'oom' in error_msg.lower():
+            print(f"✗ OOM - baseline too large, retrying with num_ctx=8192")
+            test_ctx = 8192
+            print(f"{turn_label} (retry): Testing num_ctx={test_ctx:,} (safe baseline)...", end=' ', flush=True)
+            result = test_context_size(model_name, test_ctx)
+
+            if result and 'error' not in result:
+                results.append(result)
+                print(f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: {result['offload_pct']:.1f}% CPU" if result['offload_pct'] > 0 else f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: GPU only")
+                baseline_vram = result['vram_gb']
+                baseline_ctx = test_ctx
+            else:
+                print("✗ Failed even with minimal context")
+                return {
+                    'model': model_name,
+                    'results': results,
+                    'max_context': max_context,
+                    'current_ctx': current_ctx,
+                    'vram_total': vram_total,
+                    'info': info,
+                    'error': 'Model failed to load even with minimal context (8K). Check VRAM availability.'
+                }
+        else:
+            print(f"✗ Error: {error_msg[:50]}")
+            return {
+                'model': model_name,
+                'results': results,
+                'max_context': max_context,
+                'current_ctx': current_ctx,
+                'vram_total': vram_total,
+                'info': info,
+                'error': error_msg
+            }
     else:
         print("✗ Failed")
-        return {'model': model_name, 'results': results, 'max_context': max_context, 'current_ctx': current_ctx, 'vram_total': vram_total, 'info': info}
+        return {
+            'model': model_name,
+            'results': results,
+            'max_context': max_context,
+            'current_ctx': current_ctx,
+            'vram_total': vram_total,
+            'info': info,
+            'error': 'Unknown failure during baseline test'
+        }
 
     # Turn 2: Test a higher context to calculate VRAM/context ratio
     # Try doubling the context or 32K, whichever is smaller
@@ -445,7 +497,24 @@ def find_optimal_context(model_name: str, max_turns: Optional[int], overhead_gb:
 
 def print_recommendation(analysis: Dict):
     """Print optimization recommendations."""
-    if not analysis or not analysis.get('results'):
+    if not analysis:
+        print("\n✗ No results to analyze")
+        return
+
+    # Check for errors first
+    if 'error' in analysis:
+        print("\n" + "="*70)
+        print("OPTIMIZATION FAILED")
+        print("="*70)
+        print(f"\n✗ Error: {analysis['error']}")
+        print(f"\n💡 Suggestions:")
+        print(f" 1. Check that the model is installed: ollama list")
+        print(f" 2. Ensure sufficient VRAM is available")
+        print(f" 3. Try unloading other models: ollama ps")
+        print(f" 4. Consider using a smaller model or lower quantization")
+        return
+
+    if not analysis.get('results'):
         print("\n✗ No results to analyze")
         return
 
diff --git a/web_app.py b/web_app.py
index 4c299b9..b3e140b 100644
--- a/web_app.py
+++ b/web_app.py
@@ -944,12 +944,28 @@ def api_optimize_context(model_name):
         # Use the existing find_optimal_context function from context-optimizer.py
         result = context_optimizer_module.find_optimal_context(model_name, max_turns=max_turns, overhead_gb=overhead_gb)
 
-        if not result or 'results' not in result:
+        if not result:
             return jsonify({
                 'success': False,
                 'error': 'Optimization failed or no results returned'
             })
 
+        # Check if optimization encountered an error
+        if 'error' in result:
+            return jsonify({
+                'success': False,
+                'error': result['error'],
+                'model': model_name,
+                'max_context': result.get('max_context', 0),
+                'current_context': result.get('current_ctx', 0)
+            })
+
+        if 'results' not in result or len(result['results']) == 0:
+            return jsonify({
+                'success': False,
+                'error': 'No test results available. Model may have failed to load.'
+            })
+
         # Extract data from results
         test_results = []
         optimal_context = 0
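Reviewer note: after this change, find_optimal_context always returns a dict and signals failure through an 'error' key, which api_optimize_context forwards to the client as {'success': False, ...} JSON. A minimal sketch of how a client might consume that contract follows; the base URL, route path, and HTTP method are assumptions (the route decorator is outside this diff), and only the JSON field names come from the change.

    # Hypothetical client for the failure contract introduced above.
    # ASSUMED: base URL, endpoint path, and POST method are illustrative only;
    # just the response fields ('success', 'error', 'model', 'max_context',
    # 'current_context') appear in this diff.
    import requests

    def request_optimization(model_name: str, base_url: str = "http://localhost:5000") -> dict:
        resp = requests.post(f"{base_url}/api/optimize-context/{model_name}")
        data = resp.json()
        if not data.get("success", False):
            # Failure path added by this change: surface the optimizer's message,
            # e.g. "Model failed to load even with minimal context (8K). ..."
            raise RuntimeError(f"{data.get('model', model_name)}: {data.get('error', 'unknown error')}")
        return data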