initial commit

2026-01-18 22:01:50 +01:00
parent ab25613358
commit c40874d7f0
20 changed files with 6287 additions and 11 deletions

scripts/context-optimizer.py Executable file

@@ -0,0 +1,557 @@
#!/usr/bin/env python3
"""
Ollama Context Optimizer - Find optimal num_ctx for models based on VRAM
Iteratively tests different context sizes to recommend the best setting.
"""
import argparse
import json
import subprocess
import sys
import time
import urllib.request
import urllib.error
from typing import Dict, Optional, Tuple
def get_ollama_url():
"""Get the Ollama API URL."""
return "http://localhost:11434"
def get_gpu_vram() -> Tuple[Optional[float], Optional[float]]:
"""Get GPU VRAM total and available in GB."""
import os
device_paths = [
"/sys/class/drm/card1/device/",
"/sys/class/drm/card0/device/",
]
for base_path in device_paths:
if not os.path.exists(base_path):
continue
try:
with open(base_path + "mem_info_vram_used", "r") as f:
used = int(f.read().strip()) / 1024 / 1024 / 1024 # GB
with open(base_path + "mem_info_vram_total", "r") as f:
total = int(f.read().strip()) / 1024 / 1024 / 1024 # GB
available = total - used
return total, available
except (OSError, ValueError):
continue
return None, None
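# Note: mem_info_vram_used/mem_info_vram_total (values in bytes) are exposed by the
# AMD amdgpu driver; only card0/card1 are probed here. On other GPUs or driver setups
# this helper returns (None, None) and the optimizer proceeds without a VRAM budget.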
def get_model_info(model_name: str) -> Dict:
"""Get model information including max context capability."""
try:
result = subprocess.run(
['ollama', 'show', model_name],
capture_output=True,
text=True,
check=True
)
info = {
'max_context': 0,
'current_num_ctx': 0,
'params': 'N/A',
'quant': 'N/A'
}
current_section = None
for line in result.stdout.split('\n'):
line = line.strip()
if not line:
continue
if line in ["Model", "Parameters"]:
current_section = line
continue
if current_section == "Model":
parts = line.split(maxsplit=1)
if len(parts) == 2:
k, v = parts[0].lower(), parts[1].strip()
if 'context' in k and 'length' in k:
if v.isdigit():
info['max_context'] = int(v)
elif 'context' in k:
# Handle "context length" as two words
parts2 = line.split()
if len(parts2) >= 3 and parts2[-1].isdigit():
info['max_context'] = int(parts2[-1])
elif 'quantization' in k:
info['quant'] = v
elif 'parameters' in k:
info['params'] = v
elif current_section == "Parameters":
if 'num_ctx' in line.lower():
parts = line.split(maxsplit=1)
if len(parts) == 2 and parts[1].strip().isdigit():
info['current_num_ctx'] = int(parts[1].strip())
return info
except subprocess.CalledProcessError:
return {'max_context': 0, 'current_num_ctx': 0, 'params': 'N/A', 'quant': 'N/A'}
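# Illustrative `ollama show` output that the section parser above expects; exact
# spacing and fields can vary between Ollama versions:
#   Model
#     architecture        llama
#     parameters          3.2B
#     context length      131072
#     quantization        Q5_K_M
#   Parameters
#     num_ctx             16384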
def test_context_size(model_name: str, num_ctx: int) -> Optional[Dict]:
"""
Test a model with a specific context size.
Returns VRAM usage and offload info, or None if failed.
"""
url = f"{get_ollama_url()}/api/generate"
prompt_data = {
"model": model_name,
"prompt": "Reply with only: OK",
"stream": False,
"options": {
"num_ctx": num_ctx
}
}
try:
req = urllib.request.Request(
url,
data=json.dumps(prompt_data).encode('utf-8'),
headers={'Content-Type': 'application/json'}
)
# Send request with longer timeout for large contexts
# Large contexts can take time to allocate
timeout = 60 if num_ctx > 100000 else 30
with urllib.request.urlopen(req, timeout=timeout) as response:
response_data = response.read().decode()
# Check if response contains error
try:
resp_json = json.loads(response_data)
if 'error' in resp_json:
error_msg = resp_json['error']
# Return special dict to indicate OOM or other errors
return {
'vram_gb': 0,
'total_gb': 0,
'offload_pct': 0,
'num_ctx': num_ctx,
'error': error_msg
}
except:
pass
# Wait for model to stabilize
time.sleep(0.5)
# Check /api/ps for VRAM usage
ps_url = f"{get_ollama_url()}/api/ps"
with urllib.request.urlopen(ps_url, timeout=5) as r:
ps_data = json.loads(r.read().decode())
models = ps_data.get('models', [])
for m in models:
if m['name'] == model_name or m['name'].startswith(model_name + ':'):
size_bytes = m.get('size', 0)
size_vram = m.get('size_vram', 0)
vram_gb = size_vram / (1024**3)
total_gb = size_bytes / (1024**3)
offload_pct = 0
if size_bytes > 0:
offload_pct = ((size_bytes - size_vram) / size_bytes) * 100
return {
'vram_gb': vram_gb,
'total_gb': total_gb,
'offload_pct': offload_pct,
'num_ctx': num_ctx
}
return None
except urllib.error.HTTPError as e:
# HTTP errors (500, etc.) - often indicates OOM or model loading failure
try:
error_body = e.read().decode()
error_data = json.loads(error_body)
error_msg = error_data.get('error', str(e))
except:
error_msg = f"HTTP {e.code}"
return {
'vram_gb': 0,
'total_gb': 0,
'offload_pct': 0,
'num_ctx': num_ctx,
'error': error_msg
}
except urllib.error.URLError as e:
# Network/timeout errors
if 'timed out' in str(e).lower():
error_msg = "Timeout (loading too slow)"
else:
error_msg = f"Connection error: {e}"
return {
'vram_gb': 0,
'total_gb': 0,
'offload_pct': 0,
'num_ctx': num_ctx,
'error': error_msg
}
except Exception as e:
# Other unexpected errors
return {
'vram_gb': 0,
'total_gb': 0,
'offload_pct': 0,
'num_ctx': num_ctx,
'error': str(e)[:50]
}
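# For reference, an abridged /api/ps response consumed above might look like this
# (illustrative values):
#   {"models": [{"name": "ministral-3:3b-instruct-2512-q5_k_m",
#                "size": 5100273664, "size_vram": 5100273664, ...}]}
# offload_pct is then (size - size_vram) / size * 100; e.g. 7 GB total with only
# 5 GB resident in VRAM means roughly 28.6% of the model is offloaded to CPU RAM.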
def find_optimal_context(model_name: str, max_turns: Optional[int], overhead_gb: float) -> Dict:
"""
Find optimal context size through intelligent testing.
Uses VRAM measurements to extrapolate optimal size.
Args:
model_name: Name of the Ollama model to test
max_turns: Maximum iterations (None = optimize until convergence)
overhead_gb: VRAM to keep free for system overhead
"""
print(f"Analyzing model: {model_name}")
print("-" * 70)
# Get model capabilities
info = get_model_info(model_name)
max_context = info['max_context']
current_ctx = info['current_num_ctx']
print(f"Model: {model_name}")
print(f"Parameters: {info['params']} ({info['quant']})")
print(f"Max context capability: {max_context:,}")
print(f"Current num_ctx: {current_ctx:,}")
if max_context == 0:
print("\n✗ Could not determine model's max context capability")
return {}
# Get VRAM info
vram_total, vram_available = get_gpu_vram()
if vram_total:
print(f"GPU VRAM: {vram_available:.1f} GB available / {vram_total:.1f} GB total")
print(f"Overhead reserved: {overhead_gb:.1f} GB")
# Reserve specified overhead
target_vram = vram_total - overhead_gb
else:
print("⚠ Could not detect GPU VRAM (testing will continue)")
target_vram = None
if max_turns:
print(f"Testing with max {max_turns} iterations...")
else:
print(f"Testing until convergence (num_ctx must be multiple of 2048)...")
print()
results = []
# Turn 1: Test current setting to establish baseline
test_ctx = current_ctx if current_ctx > 0 else 8192
turn_label = f"Turn 1/{max_turns}" if max_turns else "Turn 1"
print(f"{turn_label}: Testing num_ctx={test_ctx:,} (baseline)...", end=' ', flush=True)
result = test_context_size(model_name, test_ctx)
if result and 'error' not in result:
results.append(result)
print(f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: {result['offload_pct']:.1f}% CPU" if result['offload_pct'] > 0 else f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: GPU only")
baseline_vram = result['vram_gb']
baseline_ctx = test_ctx
else:
print("✗ Failed")
return {'model': model_name, 'results': results, 'max_context': max_context, 'current_ctx': current_ctx, 'vram_total': vram_total, 'info': info}
# Turn 2: Test a higher context to calculate VRAM/context ratio
# Try doubling the context or 32K, whichever is smaller
test_ctx_2 = min(baseline_ctx * 2, 32768, max_context)
if test_ctx_2 <= baseline_ctx:
test_ctx_2 = min(baseline_ctx + 16384, max_context)
# Round to multiple of 2048
test_ctx_2 = (test_ctx_2 // 2048) * 2048
turn_label = f"Turn 2/{max_turns}" if max_turns else "Turn 2"
print(f"{turn_label}: Testing num_ctx={test_ctx_2:,} (calibration)...", end=' ', flush=True)
result = test_context_size(model_name, test_ctx_2)
if result and 'error' not in result:
results.append(result)
print(f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: {result['offload_pct']:.1f}% CPU" if result['offload_pct'] > 0 else f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: GPU only")
# Calculate VRAM per 1K context tokens
vram_diff = result['vram_gb'] - baseline_vram
ctx_diff = test_ctx_2 - baseline_ctx
if ctx_diff > 0:
vram_per_1k_ctx = (vram_diff / ctx_diff) * 1000
print(f" → Estimated VRAM usage: {vram_per_1k_ctx:.4f} GB per 1K context")
# Predict optimal context size
if target_vram and vram_per_1k_ctx > 0:
available_for_ctx = target_vram - baseline_vram
estimated_additional_ctx = (available_for_ctx / vram_per_1k_ctx) * 1000
predicted_optimal = baseline_ctx + int(estimated_additional_ctx)
# Round to multiple of 2048
predicted_optimal = (predicted_optimal // 2048) * 2048
predicted_optimal = max(baseline_ctx, min(predicted_optimal, max_context))
print(f" → Predicted optimal context: {predicted_optimal:,}")
else:
predicted_optimal = None
vram_per_1k_ctx = None
else:
vram_per_1k_ctx = None
predicted_optimal = None
else:
if result and 'error' in result:
error_msg = result['error']
if 'memory' in error_msg.lower() or 'oom' in error_msg.lower():
print(f"✗ OOM (out of memory)")
else:
print(f"✗ Error: {error_msg[:30]}")
else:
print("✗ Failed")
vram_per_1k_ctx = None
predicted_optimal = None
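# Worked example of the linear extrapolation above (illustrative numbers): if
# 8,192 ctx needs 5.0 GB and 32,768 ctx needs 6.2 GB, the slope is
# (6.2 - 5.0) / (32768 - 8192) * 1000 ≈ 0.0488 GB per 1K tokens; with a 15 GB
# target budget the prediction is 8192 + (15 - 5.0) / slope * 1000 ≈ 213,000,
# rounded down to a multiple of 2048 and clamped to the model's max context.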
# Remaining turns: Test predicted optimal or use VRAM-based refinement
min_ctx = baseline_ctx
max_ctx = max_context
turn = 2
while True:
# Check if we should stop
if max_turns and turn >= max_turns:
break
if predicted_optimal and turn == 2:
# Turn 3: Test predicted optimal
test_ctx = predicted_optimal
turn_label = f"Turn {turn + 1}/{max_turns}" if max_turns else f"Turn {turn + 1}"
print(f"{turn_label}: Testing num_ctx={test_ctx:,} (predicted optimal)...", end=' ', flush=True)
else:
# Use VRAM-based prediction if we have the data
if vram_per_1k_ctx and target_vram and len(results) > 0:
# Find the last successful result (no offload)
last_good = None
for r in reversed(results):
if r['offload_pct'] == 0:
last_good = r
break
if last_good and target_vram:
# Calculate how much more context we can realistically add
available_vram = target_vram - last_good['vram_gb']
# Calculate potential additional context
additional_ctx = (available_vram / vram_per_1k_ctx) * 1000
# If we can only add < 8K context, do small increments
if additional_ctx < 8192:
# Small increments - round up to next 2048 boundary
test_ctx = last_good['num_ctx'] + 2048
test_ctx = max(min_ctx + 2048, min(test_ctx, max_ctx))
else:
# Larger headroom - use 60% of predicted to be conservative
test_ctx = last_good['num_ctx'] + int(additional_ctx * 0.6)
test_ctx = (test_ctx // 2048) * 2048
test_ctx = max(min_ctx + 2048, min(test_ctx, max_ctx))
else:
# No good result yet - binary search
test_ctx = (min_ctx + max_ctx) // 2
test_ctx = (test_ctx // 2048) * 2048
else:
# No VRAM data - fall back to binary search
test_ctx = (min_ctx + max_ctx) // 2
test_ctx = (test_ctx // 2048) * 2048
# Avoid retesting same value
if any(r['num_ctx'] == test_ctx for r in results):
# Adjust by 2048
if test_ctx < max_ctx:
test_ctx += 2048
else:
test_ctx -= 2048
if test_ctx <= min_ctx or test_ctx >= max_ctx:
print(f"\nConverged after {turn + 1} turns")
break
turn_label = f"Turn {turn + 1}/{max_turns}" if max_turns else f"Turn {turn + 1}"
print(f"{turn_label}: Testing num_ctx={test_ctx:,}...", end=' ', flush=True)
result = test_context_size(model_name, test_ctx)
if result is None:
print("✗ Failed (model not found)")
max_ctx = test_ctx
continue
if 'error' in result:
error_msg = result['error']
if 'memory' in error_msg.lower() or 'oom' in error_msg.lower():
print(f"✗ OOM (out of memory)")
elif 'timeout' in error_msg.lower():
print(f"✗ Timeout")
else:
print(f"✗ Error: {error_msg[:30]}")
max_ctx = test_ctx
continue
results.append(result)
offload_str = f"{result['offload_pct']:.1f}% CPU" if result['offload_pct'] > 0 else "GPU only"
print(f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: {offload_str}")
# Adjust search bounds
if result['offload_pct'] > 0:
max_ctx = test_ctx
else:
min_ctx = test_ctx
# Stop if we're converging (within one step of 2048)
if max_ctx - min_ctx <= 2048:
print(f"\nConverged after {turn + 1} turns")
break
turn += 1
return {
'model': model_name,
'results': results,
'max_context': max_context,
'current_ctx': current_ctx,
'vram_total': vram_total,
'info': info
}
def print_recommendation(analysis: Dict):
"""Print optimization recommendations."""
if not analysis or not analysis.get('results'):
print("\n✗ No results to analyze")
return
results = analysis['results']
max_context = analysis['max_context']
current_ctx = analysis['current_ctx']
print("\n" + "="*70)
print("OPTIMIZATION RECOMMENDATION")
print("="*70)
# Find best context without offloading
no_offload = [r for r in results if r['offload_pct'] == 0]
if no_offload:
# Recommend highest context without offloading
best = max(no_offload, key=lambda x: x['num_ctx'])
print(f"\n✓ Recommended num_ctx: {best['num_ctx']:,}")
print(f" VRAM usage: {best['vram_gb']:.2f} GB")
print(f" Status: Fits entirely in GPU memory")
if best['num_ctx'] < max_context:
print(f"\n⚠ Note: Model supports up to {max_context:,} context")
print(f" but VRAM limits optimal usage to {best['num_ctx']:,}")
if current_ctx != best['num_ctx']:
print(f"\n📝 Suggested Modelfile change:")
print(f" Current: PARAMETER num_ctx {current_ctx}")
print(f" Optimal: PARAMETER num_ctx {best['num_ctx']}")
else:
# All tests had offloading
print("\n⚠ All tested configurations require CPU offloading")
# Find least offloading
least_offload = min(results, key=lambda x: x['offload_pct'])
print(f"\n Least offloading at num_ctx={least_offload['num_ctx']:,}")
print(f" CPU offload: {least_offload['offload_pct']:.1f}%")
print(f" VRAM usage: {least_offload['vram_gb']:.2f} GB")
print(f"\n💡 Recommendations:")
print(f" 1. Use lower quantization (Q4 instead of Q5/Q8)")
print(f" 2. Reduce num_ctx to {least_offload['num_ctx']:,} or lower")
print(f" 3. Consider a smaller model variant")
# VRAM efficiency
print(f"\n📊 Tested context sizes:")
for r in sorted(results, key=lambda x: x['num_ctx']):
status = "" if r['offload_pct'] == 0 else ""
print(f" {status} {r['num_ctx']:>6,}: {r['vram_gb']:>5.2f} GB VRAM, "
f"{r['offload_pct']:>4.1f}% CPU offload")
def main():
parser = argparse.ArgumentParser(
description='Optimize Ollama model context size for VRAM constraints',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Optimize until convergence (default)
%(prog)s ministral-3:3b-instruct-2512-q5_k_m
# Cap testing at a maximum of 10 iterations
%(prog)s ministral-3:3b-instruct-2512-q5_k_m --turns 10
"""
)
parser.add_argument(
'model',
help='Model name to optimize'
)
parser.add_argument(
'--turns',
type=int,
default=None,
help='Maximum number of test iterations (default: optimize until convergence)'
)
parser.add_argument(
'--overhead',
type=float,
default=1.0,
help='VRAM overhead to keep free in GB (default: 1.0)'
)
args = parser.parse_args()
if args.turns is not None and args.turns < 2:
print("✗ Error: --turns must be at least 2")
sys.exit(1)
# Check if ollama is available
try:
subprocess.run(['ollama', '--version'], capture_output=True, check=True)
except (subprocess.CalledProcessError, FileNotFoundError):
print("✗ Error: 'ollama' command not found. Please install Ollama first.")
sys.exit(1)
# Run optimization
analysis = find_optimal_context(args.model, args.turns, args.overhead)
# Print recommendations
print_recommendation(analysis)
if __name__ == '__main__':
main()

scripts/hf-llm-install.py Executable file

@@ -0,0 +1,638 @@
#!/usr/bin/env python3
"""
HuggingFace LLM Installer for Ollama
Automatically downloads GGUF files from HuggingFace and creates Ollama models.
Features:
- SHA256 checksum verification
- Disk space checking
- Dry run mode
- Parallel processing
- Skip existing models
"""
import argparse
import hashlib
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from urllib.parse import urlparse
import urllib.request
def parse_model_name_from_gguf(gguf_filename):
"""
Parse model name and tag from GGUF filename.
Args:
gguf_filename: Name of the GGUF file
Returns:
Tuple of (model_base, tag, full_name) or (filename, 'latest', filename) if parsing fails
"""
filename_stem = Path(gguf_filename).stem.lower()
# Split on hyphens
parts = filename_stem.split('-')
if len(parts) >= 3:
# Find where the size variant starts (e.g., "0.5b", "3b", "8b", "14b")
base_parts = []
tag_parts = []
found_variant = False
for part in parts:
# Check if this looks like a size variant (e.g., "3b", "8b", "0.5b")
if not found_variant and re.match(r'^\d+(\.\d+)?b$', part):
found_variant = True
tag_parts.append(part)
elif found_variant:
# Include everything after the variant (including quantization)
tag_parts.append(part)
else:
# Before the variant = base name
base_parts.append(part)
if base_parts and tag_parts:
model_base = '-'.join(base_parts)
model_tag = '-'.join(tag_parts)
full_name = f"{model_base}:{model_tag}"
return (model_base, model_tag, full_name)
# Fallback to filename without extension
return (filename_stem, 'latest', filename_stem)
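# Worked example of the split above: "Ministral-3-3B-Instruct-2512-Q5_K_M.gguf"
# lower-cases to "ministral-3-3b-instruct-2512-q5_k_m"; "3b" is the first part
# matching the size pattern, so the base becomes "ministral-3", the tag becomes
# "3b-instruct-2512-q5_k_m", and the full name "ministral-3:3b-instruct-2512-q5_k_m".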
def parse_modelfile(modelfile_path):
"""
Parse a Modelfile to extract HuggingFace upstream URL and model info.
Args:
modelfile_path: Path to the .Modelfile
Returns:
dict with model metadata or None if invalid
"""
with open(modelfile_path, 'r') as f:
content = f.read()
# Look for hf_upstream in the header comments
hf_match = re.search(r'#\s*hf_upstream:\s*(https://huggingface\.co/[^\s]+)', content)
if not hf_match:
return None
hf_url = hf_match.group(1)
# Look for optional quantization specification (default: q4_k_m)
quant_match = re.search(r'#\s*quantization:\s*([a-zA-Z0-9_]+)', content)
quantization = quant_match.group(1).upper() if quant_match else 'Q4_K_M'
# Look for optional SHA256 checksum
sha256_match = re.search(r'#\s*sha256:\s*([a-fA-F0-9]{64})', content)
sha256 = sha256_match.group(1) if sha256_match else None
# Look for optional capabilities (comma-separated list)
# Format: # capabilities: tools, vision
capabilities_match = re.search(r'#\s*capabilities:\s*([^\n]+)', content)
capabilities = None
if capabilities_match:
# Parse comma-separated capabilities and clean whitespace
caps_str = capabilities_match.group(1).strip()
capabilities = [cap.strip() for cap in caps_str.split(',') if cap.strip()]
# Check if URL points to a specific GGUF file or just the repo
if hf_url.endswith('.gguf') or '/blob/' in hf_url or '/resolve/' in hf_url:
# Specific file provided - use as-is
resolve_url = hf_url.replace('/blob/', '/resolve/')
gguf_filename = os.path.basename(urlparse(resolve_url).path)
else:
# Repository root provided - construct filename from repo name and quantization
# URL format: https://huggingface.co/{org}/{repo}
url_parts = urlparse(hf_url).path.strip('/').split('/')
if len(url_parts) >= 2:
repo_name = url_parts[1] # e.g., "Ministral-3-3B-Instruct-2512-GGUF"
# Remove -GGUF suffix if present (case-insensitive)
if repo_name.upper().endswith('-GGUF'):
repo_name = repo_name[:-5]
# Construct filename: RepoName-Quantization.gguf
gguf_filename = f"{repo_name}-{quantization}.gguf"
resolve_url = f"{hf_url.rstrip('/')}/resolve/main/{gguf_filename}"
else:
print(f"✗ Invalid HuggingFace URL format: {hf_url}")
return None
# Extract model name and tag from the GGUF filename
# Format: Model-Version-Variant-Year-Quant.gguf -> model:version-variant-year-quant
# Example: Ministral-3-3B-Instruct-2512-Q5_K_M.gguf -> ministral-3:3b-instruct-2512-q5_k_m
model_base, model_tag, model_name = parse_model_name_from_gguf(gguf_filename)
return {
'hf_url': hf_url,
'resolve_url': resolve_url,
'gguf_filename': gguf_filename,
'model_name': model_name,
'modelfile_path': modelfile_path,
'sha256': sha256,
'capabilities': capabilities
}
def get_file_size(url):
"""
Get the size of a file from URL without downloading it.
Args:
url: File URL
Returns:
Size in bytes or None if unavailable
"""
try:
req = urllib.request.Request(url, method='HEAD')
with urllib.request.urlopen(req, timeout=10) as response:
size = response.headers.get('Content-Length')
return int(size) if size else None
except Exception:
return None
def check_disk_space(required_bytes, path='.'):
"""
Check if there's enough disk space available.
Args:
required_bytes: Required space in bytes
path: Path to check space on (default: current directory)
Returns:
Tuple of (has_space, available_bytes, required_bytes)
"""
# Get absolute path to check actual filesystem
abs_path = os.path.abspath(path)
stat = shutil.disk_usage(abs_path)
# Add 10% safety margin
required_with_margin = int(required_bytes * 1.1)
return (stat.free >= required_with_margin, stat.free, required_with_margin)
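# Example: a 4.5 GB GGUF passes the check only if at least 4.95 GB is free (4.5 * 1.1 margin).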
def calculate_sha256(filepath, chunk_size=8192):
"""
Calculate SHA256 checksum of a file.
Args:
filepath: Path to file
chunk_size: Bytes to read at once
Returns:
SHA256 hex digest
"""
sha256_hash = hashlib.sha256()
with open(filepath, 'rb') as f:
for chunk in iter(lambda: f.read(chunk_size), b''):
sha256_hash.update(chunk)
return sha256_hash.hexdigest()
def verify_checksum(filepath, expected_sha256):
"""
Verify file checksum matches expected value.
Args:
filepath: Path to file
expected_sha256: Expected SHA256 hash
Returns:
True if match, False otherwise
"""
print(f" Verifying checksum...")
actual = calculate_sha256(filepath)
if actual.lower() == expected_sha256.lower():
print(f" ✓ Checksum verified: {actual[:16]}...")
return True
else:
print(f" ✗ Checksum mismatch!")
print(f" Expected: {expected_sha256}")
print(f" Actual: {actual}")
return False
def get_existing_models():
"""
Get list of existing Ollama models.
Returns:
Set of model names
"""
try:
result = subprocess.run(
['ollama', 'list'],
capture_output=True,
text=True,
check=True
)
# Parse output to get model names
# Format: NAME ID SIZE MODIFIED
models = set()
for line in result.stdout.strip().split('\n')[1:]: # Skip header
if line.strip():
# Get first column (name)
name = line.split()[0]
# Remove tag if present
base_name = name.split(':')[0]
models.add(base_name)
return models
except (subprocess.CalledProcessError, FileNotFoundError):
return set()
def download_file(url, dest_path, filename, should_cancel=None, progress_callback=None):
"""
Download a file from URL to destination with progress indication.
Args:
url: Source URL
dest_path: Destination file path
filename: Name for display purposes
should_cancel: Optional callback function that returns True if download should be cancelled
progress_callback: Optional callback function to report progress messages
"""
def log(msg):
"""Helper to print and optionally call progress callback."""
print(msg)
if progress_callback:
progress_callback(msg)
log(f"Downloading {filename}...")
log(f" From: {url}")
log(f" To: {dest_path}")
def show_progress(block_num, block_size, total_size):
# Check for cancellation
if should_cancel and should_cancel():
raise InterruptedError("Download cancelled")
downloaded = block_num * block_size
if total_size > 0:
percent = min(100, downloaded * 100 / total_size)
mb_downloaded = downloaded / (1024 * 1024)
mb_total = total_size / (1024 * 1024)
msg = f"\r Progress: {percent:.1f}% ({mb_downloaded:.1f}/{mb_total:.1f} MB)"
print(msg, end='')
if progress_callback:
progress_callback(f"Progress: {percent:.1f}% ({mb_downloaded:.1f}/{mb_total:.1f} MB)")
try:
urllib.request.urlretrieve(url, dest_path, show_progress)
print() # New line after progress
log(f"✓ Download complete")
except Exception as e:
print(f"\n✗ Download failed: {e}")
if progress_callback:
progress_callback(f"✗ Download failed: {e}")
raise
def create_ollama_model(modelfile_path, gguf_path, model_name, capabilities=None):
"""
Create an Ollama model from the Modelfile and GGUF file.
Args:
modelfile_path: Path to the .Modelfile
gguf_path: Path to the downloaded GGUF file
model_name: Name for the Ollama model
capabilities: Optional list of capabilities to add (e.g., ['tools', 'vision'])
"""
print(f"\nCreating Ollama model: {model_name}")
# Note: Capabilities are detected from the GGUF file metadata by Ollama automatically
if capabilities:
print(f" Expected capabilities from GGUF metadata: {', '.join(capabilities)}")
# Read the Modelfile and update the FROM path to point to the downloaded GGUF
with open(modelfile_path, 'r') as f:
modelfile_content = f.read()
# Replace the FROM line to use the actual GGUF path
# Handle both relative paths like "./filename.gguf" and URLs like "https://..."
original_content = modelfile_content
modelfile_content = re.sub(
r'FROM\s+(?:\./[^\s]+\.gguf|https?://[^\n]+)',
f'FROM {gguf_path}',
modelfile_content
)
# Debug: check if replacement happened
if original_content == modelfile_content:
print(f" WARNING: FROM line was not replaced!")
print(f" Looking for pattern in: {original_content[:200]}")
else:
print(f" ✓ Replaced FROM line with local path: {gguf_path}")
# Create a temporary Modelfile with the correct path
with tempfile.NamedTemporaryFile(mode='w', suffix='.Modelfile', delete=False) as tmp_modelfile:
tmp_modelfile.write(modelfile_content)
tmp_modelfile_path = tmp_modelfile.name
try:
# Run ollama create
cmd = ['ollama', 'create', model_name, '-f', tmp_modelfile_path]
print(f" Running: {' '.join(cmd)}")
result = subprocess.run(
cmd,
capture_output=True,
text=True
)
if result.returncode == 0:
print(f"✓ Model '{model_name}' created successfully")
if result.stdout:
print(f" {result.stdout.strip()}")
else:
print(f"✗ Failed to create model")
if result.stderr:
print(f" Error: {result.stderr.strip()}")
raise subprocess.CalledProcessError(result.returncode, cmd)
finally:
# Clean up temporary Modelfile
os.unlink(tmp_modelfile_path)
def install_model(modelfile_path, dry_run=False, skip_existing=False, existing_models=None, should_cancel=None, progress_callback=None):
"""
Install a single model from a Modelfile.
Args:
modelfile_path: Path to the .Modelfile
dry_run: If True, only simulate installation
skip_existing: If True, skip models already in Ollama
existing_models: Set of existing model names
should_cancel: Optional callback function that returns True if installation should be cancelled
progress_callback: Optional callback function to report progress messages
Returns:
Tuple of (success: bool, skipped: bool, model_name: str)
"""
def log(msg):
"""Helper to print and optionally call progress callback."""
print(msg)
if progress_callback:
progress_callback(msg)
log(f"\n{'='*80}")
log(f"Processing: {modelfile_path}")
log(f"{'='*80}")
# Parse the Modelfile
model_info = parse_modelfile(modelfile_path)
if not model_info:
log(f"✗ No hf_upstream found in {modelfile_path}")
return (False, False, None)
log(f"Model name: {model_info['model_name']}")
log(f"GGUF file: {model_info['gguf_filename']}")
if model_info['sha256']:
log(f"SHA256: {model_info['sha256'][:16]}...")
if model_info.get('capabilities'):
log(f"Capabilities: {', '.join(model_info['capabilities'])}")
# Check if model already exists
if skip_existing and existing_models and model_info['model_name'] in existing_models:
log(f"⊘ Model '{model_info['model_name']}' already exists, skipping")
return (True, True, model_info['model_name'])
# Get file size and check disk space
file_size = get_file_size(model_info['resolve_url'])
if file_size:
size_gb = file_size / (1024**3)
log(f"File size: {size_gb:.2f} GB")
if not dry_run:
has_space, available, required = check_disk_space(file_size)
if not has_space:
log(f"✗ Insufficient disk space!")
log(f" Required: {required / (1024**3):.2f} GB (with 10% margin)")
log(f" Available: {available / (1024**3):.2f} GB")
return (False, False, model_info['model_name'])
else:
log(f"✓ Disk space check passed ({available / (1024**3):.2f} GB available)")
if dry_run:
log(f"\n[DRY RUN] Would download and install model: {model_info['model_name']}")
return (True, False, model_info['model_name'])
# Create temporary directory for download
with tempfile.TemporaryDirectory() as tmp_dir:
gguf_path = os.path.join(tmp_dir, model_info['gguf_filename'])
try:
# Download the GGUF file
download_file(model_info['resolve_url'], gguf_path, model_info['gguf_filename'], should_cancel, progress_callback)
# Verify checksum if provided
if model_info['sha256']:
if not verify_checksum(gguf_path, model_info['sha256']):
print(f"✗ Checksum verification failed!")
return (False, False, model_info['model_name'])
# Create the Ollama model
create_ollama_model(
modelfile_path,
gguf_path,
model_info['model_name'],
model_info.get('capabilities')
)
print(f"\n✓ Successfully installed model: {model_info['model_name']}")
return (True, False, model_info['model_name'])
except Exception as e:
print(f"\n✗ Failed to install model: {e}")
return (False, False, model_info['model_name'])
def install_model_wrapper(args):
"""Wrapper for parallel execution."""
return install_model(*args)
def main():
parser = argparse.ArgumentParser(
description='Install Ollama models from HuggingFace using Modelfiles',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Install a single model
%(prog)s path/to/model.Modelfile
# Install all models in the default repo directory
%(prog)s
# Dry run to see what would be installed
%(prog)s --dry-run
# Skip models that already exist
%(prog)s --skip-existing
# Install with 3 parallel downloads
%(prog)s --parallel 3
"""
)
parser.add_argument(
'modelfile',
nargs='?',
help='Path to a specific .Modelfile to install (optional)'
)
parser.add_argument(
'--dir',
default='./modelfile-repo',
help='Directory containing .Modelfile files (default: ./modelfile-repo)'
)
parser.add_argument(
'--dry-run',
action='store_true',
help='Simulate installation without downloading or creating models'
)
parser.add_argument(
'--skip-existing',
action='store_true',
help='Skip models that already exist in Ollama'
)
parser.add_argument(
'--parallel',
type=int,
default=1,
metavar='N',
help='Number of parallel downloads/installations (default: 1)'
)
args = parser.parse_args()
# Validate parallel argument
if args.parallel < 1:
print("✗ Error: --parallel must be at least 1")
sys.exit(1)
# Check if ollama is available
try:
subprocess.run(['ollama', '--version'], capture_output=True, check=True)
except (subprocess.CalledProcessError, FileNotFoundError):
print("✗ Error: 'ollama' command not found. Please install Ollama first.")
print(" Visit: https://ollama.ai")
sys.exit(1)
# Get existing models if skip_existing is enabled
existing_models = None
if args.skip_existing:
existing_models = get_existing_models()
if existing_models:
print(f"Found {len(existing_models)} existing model(s)")
# Determine which Modelfiles to process
if args.modelfile:
# Single file mode
modelfile_path = Path(args.modelfile)
if not modelfile_path.exists():
print(f"✗ Error: File not found: {modelfile_path}")
sys.exit(1)
if not modelfile_path.suffix == '.Modelfile':
print(f"✗ Error: File must have .Modelfile extension")
sys.exit(1)
modelfiles = [modelfile_path]
else:
# Batch mode - process all .Modelfile files in directory
modelfile_dir = Path(args.dir)
if not modelfile_dir.exists():
print(f"✗ Error: Directory not found: {modelfile_dir}")
sys.exit(1)
modelfiles = sorted(modelfile_dir.glob('*.Modelfile'))
if not modelfiles:
print(f"✗ No .Modelfile files found in {modelfile_dir}")
sys.exit(1)
print(f"Found {len(modelfiles)} Modelfile(s) to process")
if args.dry_run:
print("\n*** DRY RUN MODE - No files will be downloaded or models created ***\n")
# Process all Modelfiles
results = []
if args.parallel > 1 and len(modelfiles) > 1:
# Parallel processing
print(f"\nUsing {args.parallel} parallel worker(s)")
with ThreadPoolExecutor(max_workers=args.parallel) as executor:
# Submit all tasks
future_to_modelfile = {
executor.submit(
install_model_wrapper,
(modelfile, args.dry_run, args.skip_existing, existing_models)
): modelfile
for modelfile in modelfiles
}
# Collect results as they complete
for future in as_completed(future_to_modelfile):
modelfile = future_to_modelfile[future]
try:
success, skipped, model_name = future.result()
results.append((modelfile.name, success, skipped))
except Exception as e:
print(f"\n✗ Exception processing {modelfile.name}: {e}")
results.append((modelfile.name, False, False))
else:
# Sequential processing
for modelfile in modelfiles:
success, skipped, model_name = install_model(
modelfile,
args.dry_run,
args.skip_existing,
existing_models
)
results.append((modelfile.name, success, skipped))
# Summary
print(f"\n{'='*80}")
print("INSTALLATION SUMMARY")
print(f"{'='*80}")
successful = sum(1 for _, success, skipped in results if success and not skipped)
skipped = sum(1 for _, success, skip in results if skip)
failed = len(results) - successful - skipped
for name, success, skip in results:
if skip:
status = ""
elif success:
status = ""
else:
status = ""
print(f"{status} {name}")
print(f"\nTotal: {len(results)} | Successful: {successful} | Skipped: {skipped} | Failed: {failed}")
if failed > 0:
sys.exit(1)
if __name__ == '__main__':
main()

scripts/model-info.py Normal file

@@ -0,0 +1,184 @@
#!/usr/bin/env python3
"""
Ollama Model Inventory
- Parses the official 'Capabilities' section from ollama show
- Accurate VRAM estimation
"""
import subprocess
import re
from typing import Dict, List
def get_cmd_output(cmd: List[str]) -> str:
try:
# Run command and get stdout
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
return result.stdout.strip()
except subprocess.CalledProcessError:
return ""
def parse_parameters(param_str: str) -> float:
"""Parses '8.0B' or '307M' into standard Billions (float)"""
if not param_str or param_str == "N/A": return 0.0
clean_val = re.sub(r"[^0-9.]", "", param_str)
try:
val = float(clean_val)
if "M" in param_str.upper(): return val / 1000.0
return val
except ValueError: return 0.0
def estimate_vram(params_billions: float, quant: str, context: int, context_used: int) -> str:
"""Estimates VRAM usage (Model Weights + Typical KV Cache)."""
if params_billions == 0.0: return "N/A"
# 1. Weights size (bpp here is effectively bytes per parameter, i.e. GB per billion params)
q_up = quant.upper()
if "MXFP4" in q_up or "FP4" in q_up: bpp = 0.55
elif "Q8" in q_up: bpp = 1.0
elif "Q6" in q_up: bpp = 0.85
elif "Q5" in q_up: bpp = 0.75
elif "Q4" in q_up: bpp = 0.65
elif "Q3" in q_up: bpp = 0.55
elif "Q2" in q_up: bpp = 0.45
elif "IQ" in q_up: bpp = 0.35 # IQ quantization
elif "F16" in q_up or "BF16" in q_up: bpp = 2.0
elif "F32" in q_up: bpp = 4.0
else: bpp = 0.65 # Default Q4_K_M
weight_gb = params_billions * bpp
# 2. KV Cache Size
# More accurate formula: context_tokens * embedding_dim * layers * 2 (K+V) * bytes_per_value / 1e9
# Simplified: For a typical LLM, ~0.002 GB per 1000 tokens at FP16
# Use actual context_used if available, otherwise use a reasonable default (8K)
effective_context = context_used if context_used > 0 else min(context, 8192)
kv_cache_gb = (effective_context / 1000) * 0.002
# 3. System Overhead (Ollama runtime, etc.)
overhead_gb = 0.3
total_gb = weight_gb + kv_cache_gb + overhead_gb
if total_gb < 1: return f"{total_gb * 1024:.0f} MB"
return f"{total_gb:.1f} GB"
def get_model_info(name: str, disk_size: str) -> Dict:
try:
raw_show = get_cmd_output(['ollama', 'show', name])
except Exception as e:
return {
'model': name,
'disk': disk_size,
'family': 'ERROR',
'params_str': 'N/A',
'quant': 'N/A',
'context': 0,
'context_used': 0,
'caps': [],
'vram': 'N/A'
}
info = {
'model': name,
'disk': disk_size,
'family': 'N/A',
'params_str': 'N/A',
'quant': 'N/A',
'context': 0,
'context_used': 0, # Actual context from Parameters section
'caps': []
}
# -- State Machine Parsing --
current_section = None
lines = raw_show.split('\n')
for line in lines:
line = line.strip()
if not line: continue
# Detect Sections
if line in ["Model", "Capabilities", "Parameters", "System", "License"]:
current_section = line
continue
# Parse 'Model' Section
if current_section == "Model":
parts = line.split(maxsplit=1)
if len(parts) == 2:
k, v = parts[0].lower(), parts[1].strip()
if 'architecture' in k: info['family'] = v
elif 'parameters' in k: info['params_str'] = v
elif 'quantization' in k: info['quant'] = v
elif 'context' in k and 'length' in k:
if v.isdigit(): info['context'] = int(v)
# Fallback regex for context
if 'context' in line.lower() and info['context'] == 0:
match = re.search(r'context\s+length\s+(\d+)', line, re.IGNORECASE)
if match: info['context'] = int(match.group(1))
# Parse 'Parameters' Section (runtime config)
elif current_section == "Parameters":
if 'num_ctx' in line.lower():
parts = line.split(maxsplit=1)
if len(parts) == 2 and parts[1].strip().isdigit():
info['context_used'] = int(parts[1].strip())
# Parse 'Capabilities' Section
elif current_section == "Capabilities":
cap = line.lower()
if cap in ['tools', 'vision', 'thinking', 'insert']:
info['caps'].append(cap.capitalize())
# -- VRAM Calc --
p_val = parse_parameters(info['params_str'])
info['vram'] = estimate_vram(p_val, info['quant'], info['context'], info['context_used'])
return info
def main():
print("Fetching Ollama inventory...")
list_out = get_cmd_output(['ollama', 'list'])
data = []
lines = list_out.split('\n')[1:]
for line in lines:
if not line.strip(): continue
parts = line.split()
if len(parts) >= 3:
name = parts[0]
disk = parts[2]
print(f" Analyzing {name}...", end='\r')
data.append(get_model_info(name, disk))
print(" " * 60, end='\r')
# Formatting Table
w = {'m': 38, 'a': 12, 'p': 8, 'q': 10, 'ctx': 12, 'cp': 18, 'd': 8, 'v': 8}
header = (f"{'MODEL':<{w['m']}} {'ARCH':<{w['a']}} {'PARAMS':<{w['p']}} "
f"{'QUANT':<{w['q']}} {'CONTEXT':<{w['ctx']}} {'CAPS':<{w['cp']}} "
f"{'DISK':>{w['d']}} {'VRAM':>{w['v']}}")
print(header)
print("-" * len(header))
for r in data:
caps_str = ", ".join(r['caps']) if r['caps'] else "-"
# Truncate overly long names
d_name = (r['model'][:w['m']-2] + '..') if len(r['model']) > w['m'] else r['model']
# Format context: show used/max or just max if used not set
if r['context_used'] > 0:
ctx_str = f"{r['context_used']}/{r['context']}"
else:
ctx_str = str(r['context'])
print(f"{d_name:<{w['m']}} {r['family']:<{w['a']}} {r['params_str']:<{w['p']}} "
f"{r['quant']:<{w['q']}} {ctx_str:<{w['ctx']}} {caps_str:<{w['cp']}} "
f"{r['disk']:>{w['d']}} {r['vram']:>{w['v']}}")
if __name__ == "__main__":
main()

scripts/ollama-monitor.py Normal file

@@ -0,0 +1,164 @@
#!/usr/bin/env python3
"""
Ollama Monitor - Real-time dashboard for Ollama instances
"""
import urllib.request
import json
import subprocess
import time
import os
import sys
# Terminal colors
CLEAR, BOLD, RESET = "\033[2J\033[H", "\033[1m", "\033[0m"
CYAN, GREEN, YELLOW, MAGENTA, RED = "\033[36m", "\033[32m", "\033[33m", "\033[35m", "\033[31m"
def discover_ollama_instances():
"""Auto-discover running Ollama instances."""
instances = {}
# Try default port
if check_ollama_available("http://localhost:11434"):
instances["Ollama (default)"] = "http://localhost:11434"
# Try common alternative ports
for port in [11435, 11436]:
url = f"http://localhost:{port}"
if check_ollama_available(url):
instances[f"Ollama (port {port})"] = url
return instances
def check_ollama_available(url):
"""Check if an Ollama instance is available at the given URL."""
try:
with urllib.request.urlopen(f"{url}/api/tags", timeout=1) as r:
return r.status == 200
except Exception:
return False
def get_ollama_ps(url):
"""Get running models from Ollama instance."""
try:
with urllib.request.urlopen(f"{url}/api/ps", timeout=0.5) as r:
return json.loads(r.read().decode()).get('models', [])
except Exception:
return None
def get_gpu_metrics():
"""Try to get GPU metrics from AMD GPU sysfs."""
try:
# Try multiple possible GPU device paths
device_paths = [
"/sys/class/drm/card1/device/",
"/sys/class/drm/card0/device/",
]
for base_path in device_paths:
if not os.path.exists(base_path):
continue
try:
with open(base_path + "mem_info_vram_used", "r") as f:
used = int(f.read().strip()) / 1024 / 1024
with open(base_path + "mem_info_vram_total", "r") as f:
total = int(f.read().strip()) / 1024 / 1024
with open(base_path + "gpu_busy_percent", "r") as f:
load = int(f.read().strip())
# Sanity check: If VRAM usage is low but load is 99%, it's a driver glitch
if load == 99 and used < (total * 0.1):
load = 0
return used, total, load
except (OSError, ValueError):
continue
return None, None, None
except Exception:
return None, None, None
def get_sys_metrics():
"""Get system CPU and RAM metrics."""
try:
load_avg = os.getloadavg()[0]
mem_output = subprocess.check_output("free -m", shell=True).decode().split('\n')[1].split()
ram_used = int(mem_output[2])
ram_total = int(mem_output[1])
return load_avg, ram_used, ram_total
except Exception:
return 0.0, 0, 0
def draw(instances):
"""Draw the monitoring dashboard."""
load_avg, ram_used, ram_total = get_sys_metrics()
vram_used, vram_total, gpu_load = get_gpu_metrics()
out = [f"{CLEAR}{BOLD}{CYAN}=== OLLAMA MONITOR ==={RESET}"]
# System metrics
out.append(f"{BOLD}CPU Load:{RESET} {YELLOW}{load_avg:.2f}{RESET} | "
f"{BOLD}RAM:{RESET} {MAGENTA}{ram_used}MB/{ram_total}MB{RESET}", )
# GPU metrics (if available)
if vram_total is not None and gpu_load is not None:
load_color = GREEN if gpu_load < 80 else RED
out.append(f"{BOLD}GPU Load:{RESET} {load_color}{gpu_load}%{RESET} | "
f"{BOLD}VRAM:{RESET} {CYAN}{vram_used:.0f}MB/{vram_total:.0f}MB{RESET}")
out.append("" * 70)
# Ollama instances
for name, url in instances.items():
models = get_ollama_ps(url)
status = f"{GREEN}ONLINE{RESET}" if models is not None else f"{RED}OFFLINE{RESET}"
out.append(f"\n{BOLD}{name}{RESET} [{status}] - {url}")
if models is not None:
if len(models) > 0:
out.append(f" {'MODEL':<40} {'SIZE':<12} {'UNTIL':<20}")
for m in models:
size_gb = m.get('size', 0) / (1024**3)
until = m.get('expires_at', 'N/A')
if until != 'N/A' and 'T' in until:
# Show just the time-of-day portion of the ISO timestamp
until = until.split('T')[1].split('.')[0]
out.append(f" {m['name'][:39]:<40} {size_gb:>6.1f} GB {until}")
else:
out.append(f" {YELLOW}IDLE{RESET}")
elif models is None:
out.append(f" {RED}Connection failed{RESET}")
print("\n".join(out) + f"\n\n{BOLD}{CYAN}Refreshing... (Ctrl+C to quit){RESET}")
def main():
print("Discovering Ollama instances...")
instances = discover_ollama_instances()
if not instances:
print(f"{RED}✗ No Ollama instances found.{RESET}")
print(" Make sure Ollama is running on the default port (11434)")
sys.exit(1)
print(f"Found {len(instances)} instance(s). Starting monitor...\n")
time.sleep(1)
try:
while True:
draw(instances)
time.sleep(1)
except KeyboardInterrupt:
print("\nMonitor stopped.")
if __name__ == "__main__":
main()

scripts/vram-test.py Executable file

@@ -0,0 +1,274 @@
#!/usr/bin/env python3
"""
Ollama VRAM Test - Evaluate if models fit in VRAM
Tests models with their configured parameters and reports VRAM usage and CPU offloading.
"""
import argparse
import json
import subprocess
import sys
import time
import urllib.request
from typing import Dict, List, Optional
def get_ollama_url():
"""Get the Ollama API URL."""
return "http://localhost:11434"
def get_installed_models() -> List[str]:
"""Get list of installed Ollama models."""
try:
result = subprocess.run(
['ollama', 'list'],
capture_output=True,
text=True,
check=True
)
models = []
for line in result.stdout.strip().split('\n')[1:]: # Skip header
if line.strip():
name = line.split()[0]
models.append(name)
return models
except subprocess.CalledProcessError:
return []
def get_model_info(model_name: str) -> Dict:
"""Get model information from ollama show."""
try:
result = subprocess.run(
['ollama', 'show', model_name],
capture_output=True,
text=True,
check=True
)
info = {
'size': 'N/A',
'quant': 'N/A',
'num_ctx': 'N/A',
'params': 'N/A'
}
current_section = None
for line in result.stdout.split('\n'):
line = line.strip()
if not line:
continue
if line in ["Model", "Parameters"]:
current_section = line
continue
if current_section == "Model":
parts = line.split(maxsplit=1)
if len(parts) == 2:
k, v = parts[0].lower(), parts[1].strip()
if 'quantization' in k:
info['quant'] = v
elif 'parameters' in k:
info['params'] = v
elif current_section == "Parameters":
if 'num_ctx' in line.lower():
parts = line.split(maxsplit=1)
if len(parts) == 2:
info['num_ctx'] = parts[1].strip()
return info
except subprocess.CalledProcessError:
return {'size': 'N/A', 'quant': 'N/A', 'num_ctx': 'N/A', 'params': 'N/A'}
def test_model_vram(model_name: str) -> Dict:
"""
Test a model's VRAM usage by loading it with a minimal prompt.
Returns dict with model stats and VRAM usage.
"""
print(f"Testing {model_name}...", end=' ', flush=True)
# Get model info first
info = get_model_info(model_name)
# Send a minimal test prompt to force model loading
url = f"{get_ollama_url()}/api/generate"
prompt_data = {
"model": model_name,
"prompt": "Reply with only: OK",
"stream": False
}
try:
req = urllib.request.Request(
url,
data=json.dumps(prompt_data).encode('utf-8'),
headers={'Content-Type': 'application/json'}
)
# Send request and wait for model to load
with urllib.request.urlopen(req, timeout=30) as response:
response.read() # Wait for completion
# Give it a moment to stabilize
time.sleep(0.5)
# Now check /api/ps for VRAM usage
ps_url = f"{get_ollama_url()}/api/ps"
with urllib.request.urlopen(ps_url, timeout=5) as r:
ps_data = json.loads(r.read().decode())
models = ps_data.get('models', [])
# Find our model in the running models
for m in models:
if m['name'] == model_name or m['name'].startswith(model_name + ':'):
size_bytes = m.get('size', 0)
size_vram = m.get('size_vram', 0)
# Calculate VRAM usage in GB
vram_gb = size_vram / (1024**3) if size_vram > 0 else 0
total_gb = size_bytes / (1024**3) if size_bytes > 0 else 0
# Calculate offload percentage (how much is on CPU)
if size_bytes > 0:
offload_pct = ((size_bytes - size_vram) / size_bytes) * 100
else:
offload_pct = 0
print("")
return {
'model': model_name,
'params': info['params'],
'size_gb': total_gb,
'quant': info['quant'],
'num_ctx': info['num_ctx'],
'vram_gb': vram_gb,
'offload_pct': offload_pct,
'success': True
}
# Model not found in ps output
print("✗ (not in ps)")
return {
'model': model_name,
'params': info['params'],
'size_gb': 0,
'quant': info['quant'],
'num_ctx': info['num_ctx'],
'vram_gb': 0,
'offload_pct': 0,
'success': False
}
except Exception as e:
print(f"✗ ({str(e)[:30]})")
return {
'model': model_name,
'params': info['params'],
'size_gb': 0,
'quant': info['quant'],
'num_ctx': info['num_ctx'],
'vram_gb': 0,
'offload_pct': 0,
'success': False
}
def main():
parser = argparse.ArgumentParser(
description='Test Ollama models for VRAM usage and CPU offloading',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Test all installed models
%(prog)s
# Test a specific model
%(prog)s ministral-3:3b-instruct-2512-q5_k_m
"""
)
parser.add_argument(
'model',
nargs='?',
help='Specific model to test (optional, tests all if omitted)'
)
args = parser.parse_args()
# Check if ollama is available
try:
subprocess.run(['ollama', '--version'], capture_output=True, check=True)
except (subprocess.CalledProcessError, FileNotFoundError):
print("✗ Error: 'ollama' command not found. Please install Ollama first.")
sys.exit(1)
# Determine which models to test
if args.model:
models = [args.model]
else:
models = get_installed_models()
if not models:
print("✗ No models found")
sys.exit(1)
print(f"Found {len(models)} installed model(s)\n")
# Test each model
results = []
for model in models:
result = test_model_vram(model)
results.append(result)
# Display results table
print("\n" + "="*110)
print("VRAM USAGE TEST RESULTS")
print("="*110)
# Column widths
w = {'m': 38, 'p': 8, 's': 10, 'q': 10, 'ctx': 10, 'v': 10, 'o': 12}
header = (f"{'MODEL':<{w['m']}} {'PARAMS':<{w['p']}} {'SIZE':<{w['s']}} "
f"{'QUANT':<{w['q']}} {'NUM_CTX':<{w['ctx']}} {'VRAM':>{w['v']}} {'OFFLOAD':>{w['o']}}")
print(header)
print("-" * 110)
for r in results:
# Truncate long model names
name = (r['model'][:w['m']-2] + '..') if len(r['model']) > w['m'] else r['model']
# Format values
size_str = f"{r['size_gb']:.1f} GB" if r['size_gb'] > 0 else "N/A"
vram_str = f"{r['vram_gb']:.1f} GB" if r['vram_gb'] > 0 else "N/A"
# Offload status
if r['success']:
if r['offload_pct'] > 0:
offload_str = f"{r['offload_pct']:.1f}% CPU"
else:
offload_str = "0% (GPU only)"
else:
offload_str = "Failed"
print(f"{name:<{w['m']}} {r['params']:<{w['p']}} {size_str:<{w['s']}} "
f"{r['quant']:<{w['q']}} {r['num_ctx']:<{w['ctx']}} {vram_str:>{w['v']}} {offload_str:>{w['o']}}")
# Summary
successful = sum(1 for r in results if r['success'])
with_offload = sum(1 for r in results if r['success'] and r['offload_pct'] > 0)
print("\n" + "="*110)
print(f"Tested: {len(results)} | Successful: {successful} | CPU Offloading: {with_offload}")
if with_offload > 0:
print(f"\n{with_offload} model(s) using CPU offloading - consider reducing num_ctx or using smaller quantization")
if __name__ == '__main__':
main()