#!/usr/bin/env python3
"""
Ollama VRAM Test - Evaluate if models fit in VRAM
Tests models with their configured parameters and reports VRAM usage and CPU offloading.
"""
import argparse
import json
import subprocess
import sys
import time
import urllib.request
from typing import Dict, List, Optional
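# NOTE: the script assumes a local Ollama server reachable at
# http://localhost:11434 (see get_ollama_url below). It loads each model with a
# tiny /api/generate request and then reads /api/ps to see how much of the
# model ended up in VRAM versus system RAM.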
def get_ollama_url():
    """Get the Ollama API URL."""
    return "http://localhost:11434"


def get_installed_models() -> List[str]:
    """Get list of installed Ollama models."""
    try:
        result = subprocess.run(
            ['ollama', 'list'],
            capture_output=True,
            text=True,
            check=True
        )
        models = []
        for line in result.stdout.strip().split('\n')[1:]:  # Skip header
            if line.strip():
                name = line.split()[0]
                models.append(name)
        return models
    except subprocess.CalledProcessError:
        return []
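# `ollama show <model>` prints human-readable sections such as "Model"
# (architecture, parameter count, quantization, ...) and "Parameters"
# (Modelfile parameters like num_ctx). get_model_info() below keys off those
# section headers; the exact layout is not a stable interface, so any field
# that cannot be found simply stays "N/A".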
def get_model_info(model_name: str) -> Dict:
    """Get model information from ollama show."""
    try:
        result = subprocess.run(
            ['ollama', 'show', model_name],
            capture_output=True,
            text=True,
            check=True
        )
        info = {
            'size': 'N/A',
            'quant': 'N/A',
            'num_ctx': 'N/A',
            'params': 'N/A'
        }
        current_section = None
        for line in result.stdout.split('\n'):
            line = line.strip()
            if not line:
                continue
            if line in ["Model", "Parameters"]:
                current_section = line
                continue
            if current_section == "Model":
                parts = line.split(maxsplit=1)
                if len(parts) == 2:
                    k, v = parts[0].lower(), parts[1].strip()
                    if 'quantization' in k:
                        info['quant'] = v
                    elif 'parameters' in k:
                        info['params'] = v
            elif current_section == "Parameters":
                if 'num_ctx' in line.lower():
                    parts = line.split(maxsplit=1)
                    if len(parts) == 2:
                        info['num_ctx'] = parts[1].strip()
        return info
    except subprocess.CalledProcessError:
        return {'size': 'N/A', 'quant': 'N/A', 'num_ctx': 'N/A', 'params': 'N/A'}
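# /api/ps reports the models currently loaded by the server. test_model_vram()
# below relies on the 'name', 'size' and 'size_vram' fields of each entry; a
# response looks roughly like this (other fields omitted, model name is just an
# example):
#
#   {"models": [{"name": "llama3:8b", "size": 6200000000, "size_vram": 6200000000}]}
#
# 'size' is the total memory the loaded model occupies in bytes and 'size_vram'
# is the portion resident in GPU memory, so (size - size_vram) is what spilled
# over to CPU/system RAM.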
def test_model_vram(model_name: str) -> Dict:
    """
    Test a model's VRAM usage by loading it with a minimal prompt.
    Returns dict with model stats and VRAM usage.
    """
    print(f"Testing {model_name}...", end=' ', flush=True)

    # Get model info first
    info = get_model_info(model_name)

    # Send a minimal test prompt to force model loading
    url = f"{get_ollama_url()}/api/generate"
    prompt_data = {
        "model": model_name,
        "prompt": "Reply with only: OK",
        "stream": False
    }
    try:
        req = urllib.request.Request(
            url,
            data=json.dumps(prompt_data).encode('utf-8'),
            headers={'Content-Type': 'application/json'}
        )
        # Send request and wait for model to load
        with urllib.request.urlopen(req, timeout=30) as response:
            response.read()  # Wait for completion

        # Give it a moment to stabilize
        time.sleep(0.5)

        # Now check /api/ps for VRAM usage
        ps_url = f"{get_ollama_url()}/api/ps"
        with urllib.request.urlopen(ps_url, timeout=5) as r:
            ps_data = json.loads(r.read().decode())
        models = ps_data.get('models', [])

        # Find our model in the running models
        for m in models:
            if m['name'] == model_name or m['name'].startswith(model_name + ':'):
                size_bytes = m.get('size', 0)
                size_vram = m.get('size_vram', 0)

                # Calculate VRAM usage in GB
                vram_gb = size_vram / (1024**3) if size_vram > 0 else 0
                total_gb = size_bytes / (1024**3) if size_bytes > 0 else 0

                # Calculate offload percentage (how much is on CPU)
                if size_bytes > 0:
                    offload_pct = ((size_bytes - size_vram) / size_bytes) * 100
                else:
                    offload_pct = 0

                print("")
                return {
                    'model': model_name,
                    'params': info['params'],
                    'size_gb': total_gb,
                    'quant': info['quant'],
                    'num_ctx': info['num_ctx'],
                    'vram_gb': vram_gb,
                    'offload_pct': offload_pct,
                    'success': True
                }

        # Model not found in ps output
        print("✗ (not in ps)")
        return {
            'model': model_name,
            'params': info['params'],
            'size_gb': 0,
            'quant': info['quant'],
            'num_ctx': info['num_ctx'],
            'vram_gb': 0,
            'offload_pct': 0,
            'success': False
        }
    except Exception as e:
        print(f"✗ ({str(e)[:30]})")
        return {
            'model': model_name,
            'params': info['params'],
            'size_gb': 0,
            'quant': info['quant'],
            'num_ctx': info['num_ctx'],
            'vram_gb': 0,
            'offload_pct': 0,
            'success': False
        }
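# A loaded model's memory footprint includes its context (KV cache), which
# grows with num_ctx; that is why the summary printed by main() suggests
# lowering num_ctx or picking a smaller quantization when a model spills onto
# the CPU.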
def main():
    parser = argparse.ArgumentParser(
        description='Test Ollama models for VRAM usage and CPU offloading',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Test all installed models
  %(prog)s

  # Test a specific model
  %(prog)s ministral-3:3b-instruct-2512-q5_k_m
"""
    )
    parser.add_argument(
        'model',
        nargs='?',
        help='Specific model to test (optional, tests all if omitted)'
    )
    args = parser.parse_args()

    # Check if ollama is available
    try:
        subprocess.run(['ollama', '--version'], capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("✗ Error: 'ollama' command not found. Please install Ollama first.")
        sys.exit(1)

    # Determine which models to test
    if args.model:
        models = [args.model]
    else:
        models = get_installed_models()
        if not models:
            print("✗ No models found")
            sys.exit(1)
        print(f"Found {len(models)} installed model(s)\n")

    # Test each model
    results = []
    for model in models:
        result = test_model_vram(model)
        results.append(result)

    # Display results table
    print("\n" + "="*110)
    print("VRAM USAGE TEST RESULTS")
    print("="*110)

    # Column widths
    w = {'m': 38, 'p': 8, 's': 10, 'q': 10, 'ctx': 10, 'v': 10, 'o': 12}
    header = (f"{'MODEL':<{w['m']}} {'PARAMS':<{w['p']}} {'SIZE':<{w['s']}} "
              f"{'QUANT':<{w['q']}} {'NUM_CTX':<{w['ctx']}} {'VRAM':>{w['v']}} {'OFFLOAD':>{w['o']}}")
    print(header)
    print("-" * 110)

    for r in results:
        # Truncate long model names
        name = (r['model'][:w['m']-2] + '..') if len(r['model']) > w['m'] else r['model']

        # Format values
        size_str = f"{r['size_gb']:.1f} GB" if r['size_gb'] > 0 else "N/A"
        vram_str = f"{r['vram_gb']:.1f} GB" if r['vram_gb'] > 0 else "N/A"

        # Offload status
        if r['success']:
            if r['offload_pct'] > 0:
                offload_str = f"{r['offload_pct']:.1f}% CPU"
            else:
                offload_str = "0% (GPU only)"
        else:
            offload_str = "Failed"

        print(f"{name:<{w['m']}} {r['params']:<{w['p']}} {size_str:<{w['s']}} "
              f"{r['quant']:<{w['q']}} {r['num_ctx']:<{w['ctx']}} {vram_str:>{w['v']}} {offload_str:>{w['o']}}")

    # Summary
    successful = sum(1 for r in results if r['success'])
    with_offload = sum(1 for r in results if r['success'] and r['offload_pct'] > 0)
    print("\n" + "="*110)
    print(f"Tested: {len(results)} | Successful: {successful} | CPU Offloading: {with_offload}")
    if with_offload > 0:
        print(f"\n{with_offload} model(s) using CPU offloading - consider reducing num_ctx or using smaller quantization")


if __name__ == '__main__':
    main()