#!/usr/bin/env python3
"""
Ollama VRAM Test - Evaluate if models fit in VRAM
Tests models with their configured parameters and reports VRAM usage and CPU offloading.
"""
import argparse
import json
import subprocess
import sys
import time
import urllib.request
from typing import Dict, List, Optional
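# NOTE: the script assumes a local Ollama server reachable at
# http://localhost:11434 (see get_ollama_url below). It loads each model with a
# tiny /api/generate request and then reads /api/ps to see how much of the
# model ended up in VRAM versus system RAM.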
def get_ollama_url():
    """Get the Ollama API URL."""
    return "http://localhost:11434"


def get_installed_models() -> List[str]:
    """Get list of installed Ollama models."""
    try:
        result = subprocess.run(
            ['ollama', 'list'],
            capture_output=True,
            text=True,
            check=True
        )
        models = []
        for line in result.stdout.strip().split('\n')[1:]:  # Skip header
            if line.strip():
                name = line.split()[0]
                models.append(name)
        return models
    except subprocess.CalledProcessError:
        return []
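# `ollama show <model>` prints human-readable sections such as "Model"
# (architecture, parameter count, quantization, ...) and "Parameters"
# (Modelfile parameters like num_ctx). get_model_info() below keys off those
# section headers; the exact layout is not a stable interface, so any field
# that cannot be found simply stays "N/A".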
def get_model_info(model_name: str) -> Dict:
    """Get model information from ollama show."""
    try:
        result = subprocess.run(
            ['ollama', 'show', model_name],
            capture_output=True,
            text=True,
            check=True
        )
        info = {
            'size': 'N/A',
            'quant': 'N/A',
            'num_ctx': 'N/A',
            'params': 'N/A'
        }
        current_section = None
        for line in result.stdout.split('\n'):
            line = line.strip()
            if not line:
                continue
            if line in ["Model", "Parameters"]:
                current_section = line
                continue
            if current_section == "Model":
                parts = line.split(maxsplit=1)
                if len(parts) == 2:
                    k, v = parts[0].lower(), parts[1].strip()
                    if 'quantization' in k:
                        info['quant'] = v
                    elif 'parameters' in k:
                        info['params'] = v
            elif current_section == "Parameters":
                if 'num_ctx' in line.lower():
                    parts = line.split(maxsplit=1)
                    if len(parts) == 2:
                        info['num_ctx'] = parts[1].strip()
        return info
    except subprocess.CalledProcessError:
        return {'size': 'N/A', 'quant': 'N/A', 'num_ctx': 'N/A', 'params': 'N/A'}
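# /api/ps reports the models currently loaded by the server. test_model_vram()
# below relies on the 'name', 'size' and 'size_vram' fields of each entry; a
# response looks roughly like this (other fields omitted, model name is just an
# example):
#
#   {"models": [{"name": "llama3:8b", "size": 6200000000, "size_vram": 6200000000}]}
#
# 'size' is the total memory the loaded model occupies in bytes and 'size_vram'
# is the portion resident in GPU memory, so (size - size_vram) is what spilled
# over to CPU/system RAM.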
def test_model_vram(model_name: str) -> Dict:
    """
    Test a model's VRAM usage by loading it with a minimal prompt.
    Returns dict with model stats and VRAM usage.
    """
    print(f"Testing {model_name}...", end=' ', flush=True)

    # Get model info first
    info = get_model_info(model_name)

    # Send a minimal test prompt to force model loading
    url = f"{get_ollama_url()}/api/generate"
    prompt_data = {
        "model": model_name,
        "prompt": "Reply with only: OK",
        "stream": False
    }
    try:
        req = urllib.request.Request(
            url,
            data=json.dumps(prompt_data).encode('utf-8'),
            headers={'Content-Type': 'application/json'}
        )
        # Send request and wait for model to load
        with urllib.request.urlopen(req, timeout=30) as response:
            response.read()  # Wait for completion

        # Give it a moment to stabilize
        time.sleep(0.5)

        # Now check /api/ps for VRAM usage
        ps_url = f"{get_ollama_url()}/api/ps"
        with urllib.request.urlopen(ps_url, timeout=5) as r:
            ps_data = json.loads(r.read().decode())
        models = ps_data.get('models', [])

        # Find our model in the running models
        for m in models:
            if m['name'] == model_name or m['name'].startswith(model_name + ':'):
                size_bytes = m.get('size', 0)
                size_vram = m.get('size_vram', 0)

                # Calculate VRAM usage in GB
                vram_gb = size_vram / (1024**3) if size_vram > 0 else 0
                total_gb = size_bytes / (1024**3) if size_bytes > 0 else 0

                # Calculate offload percentage (how much is on CPU)
                if size_bytes > 0:
                    offload_pct = ((size_bytes - size_vram) / size_bytes) * 100
                else:
                    offload_pct = 0

                print("")
                return {
                    'model': model_name,
                    'params': info['params'],
                    'size_gb': total_gb,
                    'quant': info['quant'],
                    'num_ctx': info['num_ctx'],
                    'vram_gb': vram_gb,
                    'offload_pct': offload_pct,
                    'success': True
                }

        # Model not found in ps output
        print("✗ (not in ps)")
        return {
            'model': model_name,
            'params': info['params'],
            'size_gb': 0,
            'quant': info['quant'],
            'num_ctx': info['num_ctx'],
            'vram_gb': 0,
            'offload_pct': 0,
            'success': False
        }
    except Exception as e:
        print(f"✗ ({str(e)[:30]})")
        return {
            'model': model_name,
            'params': info['params'],
            'size_gb': 0,
            'quant': info['quant'],
            'num_ctx': info['num_ctx'],
            'vram_gb': 0,
            'offload_pct': 0,
            'success': False
        }
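# A loaded model's memory footprint includes its context (KV cache), which
# grows with num_ctx; that is why the summary printed by main() suggests
# lowering num_ctx or picking a smaller quantization when a model spills onto
# the CPU.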
def main():
    parser = argparse.ArgumentParser(
        description='Test Ollama models for VRAM usage and CPU offloading',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Test all installed models
  %(prog)s

  # Test a specific model
  %(prog)s ministral-3:3b-instruct-2512-q5_k_m
"""
    )
    parser.add_argument(
        'model',
        nargs='?',
        help='Specific model to test (optional, tests all if omitted)'
    )
    args = parser.parse_args()

    # Check if ollama is available
    try:
        subprocess.run(['ollama', '--version'], capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("✗ Error: 'ollama' command not found. Please install Ollama first.")
        sys.exit(1)

    # Determine which models to test
    if args.model:
        models = [args.model]
    else:
        models = get_installed_models()
        if not models:
            print("✗ No models found")
            sys.exit(1)
        print(f"Found {len(models)} installed model(s)\n")

    # Test each model
    results = []
    for model in models:
        result = test_model_vram(model)
        results.append(result)

    # Display results table
    print("\n" + "="*110)
    print("VRAM USAGE TEST RESULTS")
    print("="*110)

    # Column widths
    w = {'m': 38, 'p': 8, 's': 10, 'q': 10, 'ctx': 10, 'v': 10, 'o': 12}
    header = (f"{'MODEL':<{w['m']}} {'PARAMS':<{w['p']}} {'SIZE':<{w['s']}} "
              f"{'QUANT':<{w['q']}} {'NUM_CTX':<{w['ctx']}} {'VRAM':>{w['v']}} {'OFFLOAD':>{w['o']}}")
    print(header)
    print("-" * 110)

    for r in results:
        # Truncate long model names
        name = (r['model'][:w['m']-2] + '..') if len(r['model']) > w['m'] else r['model']

        # Format values
        size_str = f"{r['size_gb']:.1f} GB" if r['size_gb'] > 0 else "N/A"
        vram_str = f"{r['vram_gb']:.1f} GB" if r['vram_gb'] > 0 else "N/A"

        # Offload status
        if r['success']:
            if r['offload_pct'] > 0:
                offload_str = f"{r['offload_pct']:.1f}% CPU"
            else:
                offload_str = "0% (GPU only)"
        else:
            offload_str = "Failed"

        print(f"{name:<{w['m']}} {r['params']:<{w['p']}} {size_str:<{w['s']}} "
              f"{r['quant']:<{w['q']}} {r['num_ctx']:<{w['ctx']}} {vram_str:>{w['v']}} {offload_str:>{w['o']}}")

    # Summary
    successful = sum(1 for r in results if r['success'])
    with_offload = sum(1 for r in results if r['success'] and r['offload_pct'] > 0)
    print("\n" + "="*110)
    print(f"Tested: {len(results)} | Successful: {successful} | CPU Offloading: {with_offload}")
    if with_offload > 0:
        print(f"\n{with_offload} model(s) using CPU offloading - consider reducing num_ctx or using smaller quantization")


if __name__ == '__main__':
    main()