ollama-utils/scripts/context-optimizer.py

#!/usr/bin/env python3
"""
Ollama Context Optimizer - Find optimal num_ctx for models based on VRAM
Iteratively tests different context sizes to recommend the best setting.
"""

import argparse
import json
import subprocess
import sys
import time
import urllib.request
import urllib.error
from typing import Dict, Optional, Tuple


def get_ollama_url():
    """Get the Ollama API URL."""
    return "http://localhost:11434"


def get_gpu_vram() -> Tuple[Optional[float], Optional[float]]:
    """Get GPU VRAM total and available in GB."""
    import os

    device_paths = [
        "/sys/class/drm/card1/device/",
        "/sys/class/drm/card0/device/",
    ]

    for base_path in device_paths:
        if not os.path.exists(base_path):
            continue

        try:
            with open(base_path + "mem_info_vram_used", "r") as f:
                used = int(f.read().strip()) / 1024 / 1024 / 1024  # GB
            with open(base_path + "mem_info_vram_total", "r") as f:
                total = int(f.read().strip()) / 1024 / 1024 / 1024  # GB

            available = total - used
            return total, available
        except:
            continue

    return None, None


def get_model_info(model_name: str) -> Dict:
    """Get model information including max context capability."""
    try:
        result = subprocess.run(
            ['ollama', 'show', model_name],
            capture_output=True,
            text=True,
            check=True
        )

        info = {
            'max_context': 0,
            'current_num_ctx': 0,
            'params': 'N/A',
            'quant': 'N/A'
        }

        current_section = None
        for line in result.stdout.split('\n'):
            line = line.strip()
            if not line:
                continue

            if line in ["Model", "Parameters"]:
                current_section = line
                continue

            if current_section == "Model":
                parts = line.split(maxsplit=1)
                if len(parts) == 2:
                    k, v = parts[0].lower(), parts[1].strip()
                    if 'context' in k and 'length' in k:
                        if v.isdigit():
                            info['max_context'] = int(v)
                    elif 'context' in k:
                        # Handle "context length" as two words
                        parts2 = line.split()
                        if len(parts2) >= 3 and parts2[-1].isdigit():
                            info['max_context'] = int(parts2[-1])
                    elif 'quantization' in k:
                        info['quant'] = v
                    elif 'parameters' in k:
                        info['params'] = v

            elif current_section == "Parameters":
                if 'num_ctx' in line.lower():
                    parts = line.split(maxsplit=1)
                    if len(parts) == 2 and parts[1].strip().isdigit():
                        info['current_num_ctx'] = int(parts[1].strip())

        return info
    except subprocess.CalledProcessError:
        return {'max_context': 0, 'current_num_ctx': 0, 'params': 'N/A', 'quant': 'N/A'}


def test_context_size(model_name: str, num_ctx: int) -> Optional[Dict]:
    """
    Test a model with a specific context size.
    Returns VRAM usage and offload info, or None if failed.
    """
    url = f"{get_ollama_url()}/api/generate"
    prompt_data = {
        "model": model_name,
        "prompt": "Reply with only: OK",
        "stream": False,
        "options": {
            "num_ctx": num_ctx
        }
    }

    try:
        req = urllib.request.Request(
            url,
            data=json.dumps(prompt_data).encode('utf-8'),
            headers={'Content-Type': 'application/json'}
        )

        # Send request with longer timeout for large contexts
        # Large contexts can take time to allocate
        timeout = 120 if num_ctx > 100000 else 60

        with urllib.request.urlopen(req, timeout=timeout) as response:
            response_data = response.read().decode()

            # Check if response contains error
            try:
                resp_json = json.loads(response_data)
                if 'error' in resp_json:
                    error_msg = resp_json['error']
                    # Return special dict to indicate OOM or other errors
                    return {
                        'vram_gb': 0,
                        'total_gb': 0,
                        'offload_pct': 0,
                        'num_ctx': num_ctx,
                        'error': error_msg
                    }
            except:
                pass

        # Wait for model to stabilize
        time.sleep(0.5)

        # Check /api/ps for VRAM usage
        ps_url = f"{get_ollama_url()}/api/ps"
        with urllib.request.urlopen(ps_url, timeout=5) as r:
            ps_data = json.loads(r.read().decode())
            models = ps_data.get('models', [])

            for m in models:
                if m['name'] == model_name or m['name'].startswith(model_name + ':'):
                    size_bytes = m.get('size', 0)
                    size_vram = m.get('size_vram', 0)

                    vram_gb = size_vram / (1024**3)
                    total_gb = size_bytes / (1024**3)

                    offload_pct = 0
                    if size_bytes > 0:
                        offload_pct = ((size_bytes - size_vram) / size_bytes) * 100

                    return {
                        'vram_gb': vram_gb,
                        'total_gb': total_gb,
                        'offload_pct': offload_pct,
                        'num_ctx': num_ctx
                    }

        return None

    except urllib.error.HTTPError as e:
        # HTTP errors (500, etc.) - often indicates OOM or model loading failure
        try:
            error_body = e.read().decode()
            error_data = json.loads(error_body)
            error_msg = error_data.get('error', str(e))
        except:
            error_msg = f"HTTP {e.code}"

        return {
            'vram_gb': 0,
            'total_gb': 0,
            'offload_pct': 0,
            'num_ctx': num_ctx,
            'error': error_msg
        }

    except urllib.error.URLError as e:
        # Network/timeout errors
        if 'timed out' in str(e).lower():
            error_msg = "Timeout (loading too slow)"
        else:
            error_msg = f"Connection error: {e}"

        return {
            'vram_gb': 0,
            'total_gb': 0,
            'offload_pct': 0,
            'num_ctx': num_ctx,
            'error': error_msg
        }

    except Exception as e:
        # Other unexpected errors
        return {
            'vram_gb': 0,
            'total_gb': 0,
            'offload_pct': 0,
            'num_ctx': num_ctx,
            'error': str(e)[:50]
        }


def find_optimal_context(model_name: str, max_turns: Optional[int], overhead_gb: float) -> Dict:
    """
    Find optimal context size through intelligent testing.
    Uses VRAM measurements to extrapolate optimal size.

    Args:
        model_name: Name of the Ollama model to test
        max_turns: Maximum iterations (None = optimize until convergence)
        overhead_gb: VRAM to keep free for system overhead
    """
    print(f"Analyzing model: {model_name}")
    print("-" * 70)

    # Get model capabilities
    info = get_model_info(model_name)
    max_context = info['max_context']
    current_ctx = info['current_num_ctx']

    print(f"Model: {model_name}")
    print(f"Parameters: {info['params']} ({info['quant']})")
    print(f"Max context capability: {max_context:,}")
    print(f"Current num_ctx: {current_ctx:,}")

    if max_context == 0:
        print("\n✗ Could not determine model's max context capability")
        return {}

    # Get VRAM info
    vram_total, vram_available = get_gpu_vram()
    if vram_total and vram_available is not None:
        print(f"GPU VRAM: {vram_available:.1f} GB available / {vram_total:.1f} GB total")
        print(f"Overhead reserved: {overhead_gb:.1f} GB")
        # Reserve specified overhead from available VRAM (not total)
        target_vram = vram_available - overhead_gb
        print(f"Target VRAM for model: {target_vram:.1f} GB")
    else:
        print("⚠ Could not detect GPU VRAM (testing will continue)")
        target_vram = None

    if max_turns:
        print(f"Testing with max {max_turns} iterations...")
    else:
        print(f"Testing until convergence (num_ctx must be multiple of 2048)...")
    print()

    results = []

    # Turn 1: Test current setting to establish baseline
    # If current_ctx seems too high, start with a safer baseline
    test_ctx = current_ctx if current_ctx > 0 else 8192

    # Safety check: If current_ctx is very large and we know VRAM constraints, start smaller
    if target_vram and current_ctx > 65536:
        # Start with a more conservative baseline
        test_ctx = 16384
        print(f"⚠ Current num_ctx={current_ctx:,} is very large, starting with safer baseline={test_ctx:,}")

    turn_label = f"Turn 1/{max_turns}" if max_turns else "Turn 1"
    print(f"{turn_label}: Testing num_ctx={test_ctx:,} (baseline)...", end=' ', flush=True)
    result = test_context_size(model_name, test_ctx)

    if result and 'error' not in result:
        results.append(result)
        print(f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: {result['offload_pct']:.1f}% CPU" if result['offload_pct'] > 0 else f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: GPU only")
        baseline_vram = result['vram_gb']
        baseline_ctx = test_ctx
    elif result and 'error' in result:
        # First test failed - try with a much smaller context
        error_msg = result['error']
        if 'memory' in error_msg.lower() or 'oom' in error_msg.lower():
            print(f"✗ OOM - baseline too large, retrying with num_ctx=8192")
            test_ctx = 8192
            print(f"{turn_label} (retry): Testing num_ctx={test_ctx:,} (safe baseline)...", end=' ', flush=True)
            result = test_context_size(model_name, test_ctx)

            if result and 'error' not in result:
                results.append(result)
                print(f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: {result['offload_pct']:.1f}% CPU" if result['offload_pct'] > 0 else f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: GPU only")
                baseline_vram = result['vram_gb']
                baseline_ctx = test_ctx
            else:
                print("✗ Failed even with minimal context")
                return {
                    'model': model_name,
                    'results': results,
                    'max_context': max_context,
                    'current_ctx': current_ctx,
                    'vram_total': vram_total,
                    'info': info,
                    'error': 'Model failed to load even with minimal context (8K). Check VRAM availability.'
                }
        else:
            print(f"✗ Error: {error_msg[:50]}")
            return {
                'model': model_name,
                'results': results,
                'max_context': max_context,
                'current_ctx': current_ctx,
                'vram_total': vram_total,
                'info': info,
                'error': error_msg
            }
    else:
        print("✗ Failed")
        return {
            'model': model_name,
            'results': results,
            'max_context': max_context,
            'current_ctx': current_ctx,
            'vram_total': vram_total,
            'info': info,
            'error': 'Unknown failure during baseline test'
        }

    # Turn 2: Test a different context to calculate VRAM/context ratio
    # If baseline already shows offloading OR is at max_context, test LOWER
    # Otherwise, test HIGHER
    baseline_has_offload = results[0]['offload_pct'] > 0

    if baseline_has_offload or baseline_ctx >= max_context:
        # Test lower context to find where it fits
        test_ctx_2 = max(8192, baseline_ctx // 2)
        # Round to multiple of 2048
        test_ctx_2 = (test_ctx_2 // 2048) * 2048
        calibration_label = "lower bound"
    else:
        # Try doubling the context or 32K, whichever is smaller
        test_ctx_2 = min(baseline_ctx * 2, 32768, max_context)
        if test_ctx_2 <= baseline_ctx:
            test_ctx_2 = min(baseline_ctx + 16384, max_context)
        # Round to multiple of 2048
        test_ctx_2 = (test_ctx_2 // 2048) * 2048
        calibration_label = "upper bound"

    turn_label = f"Turn 2/{max_turns}" if max_turns else "Turn 2"
    print(f"{turn_label}: Testing num_ctx={test_ctx_2:,} ({calibration_label})...", end=' ', flush=True)
    result = test_context_size(model_name, test_ctx_2)

    if result and 'error' not in result:
        results.append(result)
        print(f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: {result['offload_pct']:.1f}% CPU" if result['offload_pct'] > 0 else f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: GPU only")

        # Calculate VRAM per 1K context tokens (works for both higher and lower tests)
        vram_diff = result['vram_gb'] - baseline_vram
        ctx_diff = test_ctx_2 - baseline_ctx

        if ctx_diff != 0:  # Can be positive or negative
            vram_per_1k_ctx = abs(vram_diff / ctx_diff) * 1000
            print(f"         → Estimated VRAM usage: {vram_per_1k_ctx:.4f} GB per 1K context")

            # Predict optimal context size based on available VRAM
            if target_vram and vram_per_1k_ctx > 0:
                # Find which result has no offload (if any) to use as reference
                ref_result = result if result['offload_pct'] == 0 else (results[0] if results[0]['offload_pct'] == 0 else None)

                if ref_result:
                    # We have a point that fits - extrapolate from there
                    available_for_ctx = target_vram - ref_result['vram_gb']
                    estimated_additional_ctx = (available_for_ctx / vram_per_1k_ctx) * 1000
                    predicted_optimal = ref_result['num_ctx'] + int(estimated_additional_ctx)
                else:
                    # Neither fits - need to find what would fit
                    # Start from the smaller test and work backwards
                    smaller_result = results[0] if results[0]['num_ctx'] < result['num_ctx'] else result
                    vram_needed_reduction = smaller_result['vram_gb'] - target_vram
                    ctx_reduction_needed = (vram_needed_reduction / vram_per_1k_ctx) * 1000
                    predicted_optimal = smaller_result['num_ctx'] - int(ctx_reduction_needed)

                # Round to multiple of 2048 and clamp to valid range
                predicted_optimal = (predicted_optimal // 2048) * 2048
                predicted_optimal = max(2048, min(predicted_optimal, max_context))

                print(f"         → Predicted optimal context: {predicted_optimal:,}")
            else:
                predicted_optimal = None
                vram_per_1k_ctx = None
        else:
            vram_per_1k_ctx = None
            predicted_optimal = None
    else:
        if result and 'error' in result:
            error_msg = result['error']
            if 'memory' in error_msg.lower() or 'oom' in error_msg.lower():
                print(f"✗ OOM (out of memory)")
            else:
                print(f"✗ Error: {error_msg[:30]}")
        else:
            print("✗ Failed")
        vram_per_1k_ctx = None
        predicted_optimal = None

    # Remaining turns: Test predicted optimal or use VRAM-based refinement
    # Initialize search bounds based on whether baseline has offload
    if baseline_has_offload:
        # Search downward from baseline to find what fits
        min_ctx = 2048  # Minimum practical context
        max_ctx = baseline_ctx
    else:
        # Search upward from baseline to find max that fits
        min_ctx = baseline_ctx
        max_ctx = max_context

    turn = 2
    while True:
        # Check if we should stop
        if max_turns and turn >= max_turns:
            break

        if predicted_optimal and turn == 2:
            # Turn 3: Test predicted optimal
            test_ctx = predicted_optimal
            turn_label = f"Turn {turn + 1}/{max_turns}" if max_turns else f"Turn {turn + 1}"
            print(f"{turn_label}: Testing num_ctx={test_ctx:,} (predicted optimal)...", end=' ', flush=True)
        else:
            # Use VRAM-based prediction if we have the data
            if vram_per_1k_ctx and target_vram and len(results) > 0:
                # Find the last successful result (no offload)
                last_good = None
                for r in reversed(results):
                    if r['offload_pct'] == 0:
                        last_good = r
                        break

                if last_good and target_vram:
                    # Calculate how much more context we can realistically add
                    available_vram = target_vram - last_good['vram_gb']

                    # Calculate potential additional context
                    additional_ctx = (available_vram / vram_per_1k_ctx) * 1000

                    # If we can only add < 8K context, do small increments
                    if additional_ctx < 8192:
                        # Small increments - round up to next 2048 boundary
                        test_ctx = last_good['num_ctx'] + 2048
                        test_ctx = max(min_ctx + 2048, min(test_ctx, max_ctx))
                    else:
                        # Larger headroom - use 60% of predicted to be conservative
                        test_ctx = last_good['num_ctx'] + int(additional_ctx * 0.6)
                        test_ctx = (test_ctx // 2048) * 2048
                        test_ctx = max(min_ctx + 2048, min(test_ctx, max_ctx))
                else:
                    # No good result yet - binary search
                    test_ctx = (min_ctx + max_ctx) // 2
                    test_ctx = (test_ctx // 2048) * 2048
            else:
                # No VRAM data - fall back to binary search
                test_ctx = (min_ctx + max_ctx) // 2
                test_ctx = (test_ctx // 2048) * 2048

            # Avoid retesting same value
            if any(r['num_ctx'] == test_ctx for r in results):
                # Adjust by 2048
                if test_ctx < max_ctx:
                    test_ctx += 2048
                else:
                    test_ctx -= 2048

                if test_ctx <= min_ctx or test_ctx >= max_ctx:
                    print(f"\nConverged after {turn + 1} turns")
                    break

            turn_label = f"Turn {turn + 1}/{max_turns}" if max_turns else f"Turn {turn + 1}"
            print(f"{turn_label}: Testing num_ctx={test_ctx:,}...", end=' ', flush=True)

        result = test_context_size(model_name, test_ctx)

        if result is None:
            print("✗ Failed (model not found)")
            max_ctx = test_ctx
            continue

        if 'error' in result:
            error_msg = result['error']
            if 'memory' in error_msg.lower() or 'oom' in error_msg.lower():
                print(f"✗ OOM (out of memory)")
            elif 'timeout' in error_msg.lower():
                print(f"✗ Timeout")
            else:
                print(f"✗ Error: {error_msg[:30]}")
            max_ctx = test_ctx
            continue

        results.append(result)

        offload_str = f"{result['offload_pct']:.1f}% CPU" if result['offload_pct'] > 0 else "GPU only"
        print(f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: {offload_str}")

        # Adjust search bounds
        if result['offload_pct'] > 0:
            max_ctx = test_ctx
        else:
            min_ctx = test_ctx

        # Stop if we're converging (within one step of 2048)
        if max_ctx - min_ctx <= 2048:
            print(f"\nConverged after {turn + 1} turns")
            break

        turn += 1

    return {
        'model': model_name,
        'results': results,
        'max_context': max_context,
        'current_ctx': current_ctx,
        'vram_total': vram_total,
        'info': info
    }


def print_recommendation(analysis: Dict):
    """Print optimization recommendations."""
    if not analysis:
        print("\n✗ No results to analyze")
        return

    # Check for errors first
    if 'error' in analysis:
        print("\n" + "="*70)
        print("OPTIMIZATION FAILED")
        print("="*70)
        print(f"\n✗ Error: {analysis['error']}")
        print(f"\n💡 Suggestions:")
        print(f"   1. Check that the model is installed: ollama list")
        print(f"   2. Ensure sufficient VRAM is available")
        print(f"   3. Try unloading other models: ollama ps")
        print(f"   4. Consider using a smaller model or lower quantization")
        return

    if not analysis.get('results'):
        print("\n✗ No results to analyze")
        return

    results = analysis['results']
    max_context = analysis['max_context']
    current_ctx = analysis['current_ctx']

    print("\n" + "="*70)
    print("OPTIMIZATION RECOMMENDATION")
    print("="*70)

    # Find best context without offloading
    no_offload = [r for r in results if r['offload_pct'] == 0]

    if no_offload:
        # Recommend highest context without offloading
        best = max(no_offload, key=lambda x: x['num_ctx'])

        print(f"\n✓ Recommended num_ctx: {best['num_ctx']:,}")
        print(f"  VRAM usage: {best['vram_gb']:.2f} GB")
        print(f"  Status: Fits entirely in GPU memory")

        if best['num_ctx'] < max_context:
            print(f"\n⚠ Note: Model supports up to {max_context:,} context")
            print(f"  but VRAM limits optimal usage to {best['num_ctx']:,}")

        if current_ctx != best['num_ctx']:
            print(f"\n📝 Suggested Modelfile change:")
            print(f"   Current: PARAMETER num_ctx {current_ctx}")
            print(f"   Optimal: PARAMETER num_ctx {best['num_ctx']}")
    else:
        # All tests had offloading
        print("\n⚠ All tested configurations require CPU offloading")

        # Find least offloading
        least_offload = min(results, key=lambda x: x['offload_pct'])

        print(f"\n  Least offloading at num_ctx={least_offload['num_ctx']:,}")
        print(f"  CPU offload: {least_offload['offload_pct']:.1f}%")
        print(f"  VRAM usage: {least_offload['vram_gb']:.2f} GB")

        print(f"\n💡 Recommendations:")
        print(f"   1. Use lower quantization (Q4 instead of Q5/Q8)")
        print(f"   2. Reduce num_ctx to {least_offload['num_ctx']:,} or lower")
        print(f"   3. Consider a smaller model variant")

    # VRAM efficiency
    print(f"\n📊 Tested context sizes:")
    for r in sorted(results, key=lambda x: x['num_ctx']):
        status = "✓" if r['offload_pct'] == 0 else "✗"
        print(f"   {status} {r['num_ctx']:>6,}: {r['vram_gb']:>5.2f} GB VRAM, "
              f"{r['offload_pct']:>4.1f}% CPU offload")


def main():
    parser = argparse.ArgumentParser(
        description='Optimize Ollama model context size for VRAM constraints',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Optimize with 5 test iterations (default)
  %(prog)s ministral-3:3b-instruct-2512-q5_k_m

  # Use 10 iterations for more precise optimization
  %(prog)s ministral-3:3b-instruct-2512-q5_k_m --turns 10
        """
    )

    parser.add_argument(
        'model',
        help='Model name to optimize'
    )

    parser.add_argument(
        '--turns',
        type=int,
        default=None,
        help='Maximum number of test iterations (default: optimize until convergence)'
    )

    parser.add_argument(
        '--overhead',
        type=float,
        default=1.0,
        help='VRAM overhead to keep free in GB (default: 1.0)'
    )

    args = parser.parse_args()

    if args.turns is not None and args.turns < 2:
        print("✗ Error: --turns must be at least 2")
        sys.exit(1)

    # Check if ollama is available
    try:
        subprocess.run(['ollama', '--version'], capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("✗ Error: 'ollama' command not found. Please install Ollama first.")
        sys.exit(1)

    # Run optimization
    analysis = find_optimal_context(args.model, args.turns, args.overhead)

    # Print recommendations
    print_recommendation(analysis)


if __name__ == '__main__':
    main()