initial commit
scripts/vram-test.py · 274 lines · executable file
@@ -0,0 +1,274 @@
#!/usr/bin/env python3
"""
Ollama VRAM Test - Evaluate if models fit in VRAM

Tests models with their configured parameters and reports VRAM usage and CPU offloading.
"""

import argparse
import json
import subprocess
import sys
import time
import urllib.request
from typing import Dict, List


def get_ollama_url():
    """Get the Ollama API URL."""
    return "http://localhost:11434"


def get_installed_models() -> List[str]:
    """Get list of installed Ollama models."""
    try:
        result = subprocess.run(
            ['ollama', 'list'],
            capture_output=True,
            text=True,
            check=True
        )

        models = []
        for line in result.stdout.strip().split('\n')[1:]:  # Skip header
            if line.strip():
                name = line.split()[0]
                models.append(name)

        return models
    except subprocess.CalledProcessError:
        return []
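
# `ollama list` output assumed by the parser above: a header row followed by
# one row per model, with the model name in the first column. Illustrative
# only (names, IDs and sizes invented; exact columns vary by Ollama version):
#
#   NAME                          ID              SIZE      MODIFIED
#   qwen2.5:7b-instruct-q4_K_M    845dbda0ea48    4.7 GB    2 days ago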


def get_model_info(model_name: str) -> Dict:
    """Get model information from ollama show."""
    try:
        result = subprocess.run(
            ['ollama', 'show', model_name],
            capture_output=True,
            text=True,
            check=True
        )

        info = {
            'size': 'N/A',
            'quant': 'N/A',
            'num_ctx': 'N/A',
            'params': 'N/A'
        }

        current_section = None
        for line in result.stdout.split('\n'):
            line = line.strip()
            if not line:
                continue

            if line in ["Model", "Parameters"]:
                current_section = line
                continue

            if current_section == "Model":
                parts = line.split(maxsplit=1)
                if len(parts) == 2:
                    k, v = parts[0].lower(), parts[1].strip()
                    if 'quantization' in k:
                        info['quant'] = v
                    elif 'parameters' in k:
                        info['params'] = v

            elif current_section == "Parameters":
                if 'num_ctx' in line.lower():
                    parts = line.split(maxsplit=1)
                    if len(parts) == 2:
                        info['num_ctx'] = parts[1].strip()

        return info
    except subprocess.CalledProcessError:
        return {'size': 'N/A', 'quant': 'N/A', 'num_ctx': 'N/A', 'params': 'N/A'}
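
# `ollama show <model>` output this parser expects: key/value pairs grouped
# under section headings (lines are stripped before matching, so indentation
# does not matter). Illustrative sketch only, with invented values; layout
# varies by Ollama version:
#
#   Model
#     architecture        llama
#     parameters          7.6B
#     quantization        Q4_K_M
#
#   Parameters
#     num_ctx             8192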


def test_model_vram(model_name: str) -> Dict:
    """
    Test a model's VRAM usage by loading it with a minimal prompt.
    Returns dict with model stats and VRAM usage.
    """
    print(f"Testing {model_name}...", end=' ', flush=True)

    # Get model info first
    info = get_model_info(model_name)

    # Send a minimal test prompt to force model loading
    url = f"{get_ollama_url()}/api/generate"
    prompt_data = {
        "model": model_name,
        "prompt": "Reply with only: OK",
        "stream": False
    }

    try:
        req = urllib.request.Request(
            url,
            data=json.dumps(prompt_data).encode('utf-8'),
            headers={'Content-Type': 'application/json'}
        )

        # Send request and wait for model to load
        with urllib.request.urlopen(req, timeout=30) as response:
            response.read()  # Wait for completion

        # Give it a moment to stabilize
        time.sleep(0.5)

        # Now check /api/ps for VRAM usage
        ps_url = f"{get_ollama_url()}/api/ps"
        with urllib.request.urlopen(ps_url, timeout=5) as r:
            ps_data = json.loads(r.read().decode())
            models = ps_data.get('models', [])

        # Find our model in the running models
        for m in models:
            if m['name'] == model_name or m['name'].startswith(model_name + ':'):
                size_bytes = m.get('size', 0)
                size_vram = m.get('size_vram', 0)

                # Calculate VRAM usage in GB
                vram_gb = size_vram / (1024**3) if size_vram > 0 else 0
                total_gb = size_bytes / (1024**3) if size_bytes > 0 else 0

                # Calculate offload percentage (how much is on CPU)
                if size_bytes > 0:
                    offload_pct = ((size_bytes - size_vram) / size_bytes) * 100
                else:
                    offload_pct = 0
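
                # Worked example (illustrative numbers): a model reporting
                # size = 5.0 GB with size_vram = 4.0 GB yields
                # (5.0 - 4.0) / 5.0 * 100 = 20% of the model offloaded to CPU.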

                print("✓")

                return {
                    'model': model_name,
                    'params': info['params'],
                    'size_gb': total_gb,
                    'quant': info['quant'],
                    'num_ctx': info['num_ctx'],
                    'vram_gb': vram_gb,
                    'offload_pct': offload_pct,
                    'success': True
                }

        # Model not found in ps output
        print("✗ (not in ps)")
        return {
            'model': model_name,
            'params': info['params'],
            'size_gb': 0,
            'quant': info['quant'],
            'num_ctx': info['num_ctx'],
            'vram_gb': 0,
            'offload_pct': 0,
            'success': False
        }

    except Exception as e:
        print(f"✗ ({str(e)[:30]})")
        return {
            'model': model_name,
            'params': info['params'],
            'size_gb': 0,
            'quant': info['quant'],
            'num_ctx': info['num_ctx'],
            'vram_gb': 0,
            'offload_pct': 0,
            'success': False
        }
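
# Shape of the /api/ps response the function above relies on. Only the fields
# it actually reads (`name`, `size`, `size_vram`) are shown, and the values
# are invented for illustration:
#
#   {
#     "models": [
#       {"name": "qwen2.5:7b-instruct-q4_K_M",
#        "size": 5368709120,         # total bytes occupied by the loaded model
#        "size_vram": 4294967296}    # bytes of that total resident in VRAM
#     ]
#   }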


def main():
    parser = argparse.ArgumentParser(
        description='Test Ollama models for VRAM usage and CPU offloading',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Test all installed models
  %(prog)s

  # Test a specific model
  %(prog)s ministral-3:3b-instruct-2512-q5_k_m
"""
    )

    parser.add_argument(
        'model',
        nargs='?',
        help='Specific model to test (optional, tests all if omitted)'
    )

    args = parser.parse_args()

    # Check if ollama is available
    try:
        subprocess.run(['ollama', '--version'], capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("✗ Error: 'ollama' command not found. Please install Ollama first.")
        sys.exit(1)

    # Determine which models to test
    if args.model:
        models = [args.model]
    else:
        models = get_installed_models()
        if not models:
            print("✗ No models found")
            sys.exit(1)
        print(f"Found {len(models)} installed model(s)\n")

    # Test each model
    results = []
    for model in models:
        result = test_model_vram(model)
        results.append(result)

    # Display results table
    print("\n" + "=" * 110)
    print("VRAM USAGE TEST RESULTS")
    print("=" * 110)

    # Column widths
    w = {'m': 38, 'p': 8, 's': 10, 'q': 10, 'ctx': 10, 'v': 10, 'o': 12}

    header = (f"{'MODEL':<{w['m']}} {'PARAMS':<{w['p']}} {'SIZE':<{w['s']}} "
              f"{'QUANT':<{w['q']}} {'NUM_CTX':<{w['ctx']}} {'VRAM':>{w['v']}} {'OFFLOAD':>{w['o']}}")

    print(header)
    print("-" * 110)

    for r in results:
        # Truncate long model names
        name = (r['model'][:w['m'] - 2] + '..') if len(r['model']) > w['m'] else r['model']

        # Format values
        size_str = f"{r['size_gb']:.1f} GB" if r['size_gb'] > 0 else "N/A"
        vram_str = f"{r['vram_gb']:.1f} GB" if r['vram_gb'] > 0 else "N/A"

        # Offload status
        if r['success']:
            if r['offload_pct'] > 0:
                offload_str = f"{r['offload_pct']:.1f}% CPU"
            else:
                offload_str = "0% (GPU only)"
        else:
            offload_str = "Failed"

        print(f"{name:<{w['m']}} {r['params']:<{w['p']}} {size_str:<{w['s']}} "
              f"{r['quant']:<{w['q']}} {r['num_ctx']:<{w['ctx']}} {vram_str:>{w['v']}} {offload_str:>{w['o']}}")

    # Summary
    successful = sum(1 for r in results if r['success'])
    with_offload = sum(1 for r in results if r['success'] and r['offload_pct'] > 0)

    print("\n" + "=" * 110)
    print(f"Tested: {len(results)} | Successful: {successful} | CPU Offloading: {with_offload}")

    if with_offload > 0:
        print(f"\n⚠ {with_offload} model(s) using CPU offloading - consider reducing num_ctx or using smaller quantization")


if __name__ == '__main__':
    main()
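
# Typical invocation, with an invented output row for illustration only
# (consistent with the 20%-offload worked example above):
#
#   $ ./scripts/vram-test.py qwen2.5:7b-instruct-q4_K_M
#   Testing qwen2.5:7b-instruct-q4_K_M... ✓
#   ...
#   MODEL                        PARAMS   SIZE     ...   VRAM      OFFLOAD
#   qwen2.5:7b-instruct-q4_K_M   7.6B     5.0 GB   ...   4.0 GB    20.0% CPU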