fix: silent failures for context optimizer
context-optimizer.py
@@ -266,7 +266,15 @@ def find_optimal_context(model_name: str, max_turns: Optional[int], overhead_gb:
     results = []
 
     # Turn 1: Test current setting to establish baseline
+    # If current_ctx seems too high, start with a safer baseline
     test_ctx = current_ctx if current_ctx > 0 else 8192
+
+    # Safety check: If current_ctx is very large and we know VRAM constraints, start smaller
+    if target_vram and current_ctx > 65536:
+        # Start with a more conservative baseline
+        test_ctx = 16384
+        print(f"⚠ Current num_ctx={current_ctx:,} is very large, starting with safer baseline={test_ctx:,}")
+
     turn_label = f"Turn 1/{max_turns}" if max_turns else "Turn 1"
     print(f"{turn_label}: Testing num_ctx={test_ctx:,} (baseline)...", end=' ', flush=True)
     result = test_context_size(model_name, test_ctx)
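
Note: the baseline logic above leans entirely on the shape of the dict returned by `test_context_size`, which is outside this diff. Below is a minimal sketch of the assumed contract (a dict carrying either 'vram_gb'/'offload_pct' or an 'error' key), probing Ollama's /api/generate and /api/ps endpoints; the real helper in context-optimizer.py may differ in detail:

import requests

OLLAMA = "http://localhost:11434"  # assumed default Ollama endpoint

def test_context_size(model_name: str, num_ctx: int) -> dict:
    """Load model_name at num_ctx and report VRAM use, or an 'error' key on failure."""
    try:
        r = requests.post(
            f"{OLLAMA}/api/generate",
            json={"model": model_name, "prompt": "hi",
                  "options": {"num_ctx": num_ctx}, "stream": False},
            timeout=300,
        )
        r.raise_for_status()
    except requests.RequestException as exc:
        return {"num_ctx": num_ctx, "error": str(exc)}

    # /api/ps lists loaded models with their total size and VRAM-resident share
    for m in requests.get(f"{OLLAMA}/api/ps", timeout=10).json().get("models", []):
        if m["name"].startswith(model_name):
            size, vram = m["size"], m["size_vram"]
            return {"num_ctx": num_ctx,
                    "vram_gb": vram / 2**30,
                    "offload_pct": 100.0 * (size - vram) / size if size else 0.0}
    return {"num_ctx": num_ctx, "error": "model not listed in /api/ps"}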
@@ -276,9 +284,53 @@ def find_optimal_context(model_name: str, max_turns: Optional[int], overhead_gb:
         print(f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: {result['offload_pct']:.1f}% CPU" if result['offload_pct'] > 0 else f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: GPU only")
         baseline_vram = result['vram_gb']
         baseline_ctx = test_ctx
+    elif result and 'error' in result:
+        # First test failed - try with a much smaller context
+        error_msg = result['error']
+        if 'memory' in error_msg.lower() or 'oom' in error_msg.lower():
+            print(f"✗ OOM - baseline too large, retrying with num_ctx=8192")
+            test_ctx = 8192
+            print(f"{turn_label} (retry): Testing num_ctx={test_ctx:,} (safe baseline)...", end=' ', flush=True)
+            result = test_context_size(model_name, test_ctx)
+
+            if result and 'error' not in result:
+                results.append(result)
+                print(f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: {result['offload_pct']:.1f}% CPU" if result['offload_pct'] > 0 else f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: GPU only")
+                baseline_vram = result['vram_gb']
+                baseline_ctx = test_ctx
+            else:
+                print("✗ Failed even with minimal context")
+                return {
+                    'model': model_name,
+                    'results': results,
+                    'max_context': max_context,
+                    'current_ctx': current_ctx,
+                    'vram_total': vram_total,
+                    'info': info,
+                    'error': 'Model failed to load even with minimal context (8K). Check VRAM availability.'
+                }
+        else:
+            print(f"✗ Error: {error_msg[:50]}")
+            return {
+                'model': model_name,
+                'results': results,
+                'max_context': max_context,
+                'current_ctx': current_ctx,
+                'vram_total': vram_total,
+                'info': info,
+                'error': error_msg
+            }
     else:
         print("✗ Failed")
-        return {'model': model_name, 'results': results, 'max_context': max_context, 'current_ctx': current_ctx, 'vram_total': vram_total, 'info': info}
+        return {
+            'model': model_name,
+            'results': results,
+            'max_context': max_context,
+            'current_ctx': current_ctx,
+            'vram_total': vram_total,
+            'info': info,
+            'error': 'Unknown failure during baseline test'
+        }
 
     # Turn 2: Test a higher context to calculate VRAM/context ratio
     # Try doubling the context or 32K, whichever is smaller
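
The net effect of this hunk is that every baseline failure path now returns a dict carrying an 'error' key (plus the usual model/context metadata) instead of silently falling through with an empty results list. A hedged sketch of what a caller can now rely on; the model name and argument values below are illustrative:

analysis = find_optimal_context("llama3:8b", max_turns=5, overhead_gb=1.0)

if 'error' in analysis:
    # Failure is now explicit: the dict still carries model/context
    # metadata alongside the human-readable reason.
    raise SystemExit(f"Optimization of {analysis['model']} failed: {analysis['error']}")

for r in analysis['results']:
    print(r)  # one entry per successfully tested num_ctx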
@@ -445,7 +497,24 @@ def find_optimal_context(model_name: str, max_turns: Optional[int], overhead_gb:
 
 def print_recommendation(analysis: Dict):
     """Print optimization recommendations."""
-    if not analysis or not analysis.get('results'):
+    if not analysis:
         print("\n✗ No results to analyze")
         return
 
+    # Check for errors first
+    if 'error' in analysis:
+        print("\n" + "="*70)
+        print("OPTIMIZATION FAILED")
+        print("="*70)
+        print(f"\n✗ Error: {analysis['error']}")
+        print(f"\n💡 Suggestions:")
+        print(f" 1. Check that the model is installed: ollama list")
+        print(f" 2. Ensure sufficient VRAM is available")
+        print(f" 3. Try unloading other models: ollama ps")
+        print(f" 4. Consider using a smaller model or lower quantization")
+        return
+
+    if not analysis.get('results'):
+        print("\n✗ No results to analyze")
+        return
+
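With the error branch in place, a failed analysis dict can be handed to print_recommendation directly. A small illustration; the dict values are made up, only the keys come from the diff above:

print_recommendation({
    'model': 'llama3:8b',   # illustrative
    'results': [],
    'error': 'Model failed to load even with minimal context (8K). Check VRAM availability.',
})
# Prints the OPTIMIZATION FAILED banner, the error, and the four
# troubleshooting suggestions, then returns without analyzing results.
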
web_app.py (18 changed lines)
@@ -944,12 +944,28 @@ def api_optimize_context(model_name):
     # Use the existing find_optimal_context function from context-optimizer.py
     result = context_optimizer_module.find_optimal_context(model_name, max_turns=max_turns, overhead_gb=overhead_gb)
 
-    if not result or 'results' not in result:
+    if not result:
         return jsonify({
             'success': False,
             'error': 'Optimization failed or no results returned'
         })
 
+    # Check if optimization encountered an error
+    if 'error' in result:
+        return jsonify({
+            'success': False,
+            'error': result['error'],
+            'model': model_name,
+            'max_context': result.get('max_context', 0),
+            'current_context': result.get('current_ctx', 0)
+        })
+
+    if 'results' not in result or len(result['results']) == 0:
+        return jsonify({
+            'success': False,
+            'error': 'No test results available. Model may have failed to load.'
+        })
+
     # Extract data from results
     test_results = []
     optimal_context = 0
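
On the API side, a failed optimization now comes back as a structured JSON error rather than a crash or an empty payload. A hedged client-side sketch; the route path, host, and HTTP method are assumptions (the decorator for api_optimize_context is outside this hunk), and only the response fields come from the diff:

import requests

# Hypothetical route and port for the Flask app
resp = requests.post("http://localhost:5000/api/optimize_context/llama3:8b").json()

if not resp.get('success'):
    # e.g. {'success': False, 'error': '...', 'model': 'llama3:8b',
    #       'max_context': 0, 'current_context': 0}
    print(f"Optimization failed: {resp['error']}")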