initial commit
557
scripts/context-optimizer.py
Executable file
@@ -0,0 +1,557 @@
#!/usr/bin/env python3
"""
Ollama Context Optimizer - Find optimal num_ctx for models based on VRAM
Iteratively tests different context sizes to recommend the best setting.
"""

import argparse
import json
import subprocess
import sys
import time
import urllib.request
import urllib.error
from typing import Dict, Optional, Tuple


def get_ollama_url():
    """Get the Ollama API URL."""
    return "http://localhost:11434"


def get_gpu_vram() -> Tuple[Optional[float], Optional[float]]:
    """Get GPU VRAM total and available in GB."""
    import os

    device_paths = [
        "/sys/class/drm/card1/device/",
        "/sys/class/drm/card0/device/",
    ]

    for base_path in device_paths:
        if not os.path.exists(base_path):
            continue

        try:
            with open(base_path + "mem_info_vram_used", "r") as f:
                used = int(f.read().strip()) / 1024 / 1024 / 1024  # GB
            with open(base_path + "mem_info_vram_total", "r") as f:
                total = int(f.read().strip()) / 1024 / 1024 / 1024  # GB

            available = total - used
            return total, available
        except:
            continue

    return None, None


def get_model_info(model_name: str) -> Dict:
    """Get model information including max context capability."""
    try:
        result = subprocess.run(
            ['ollama', 'show', model_name],
            capture_output=True,
            text=True,
            check=True
        )

        info = {
            'max_context': 0,
            'current_num_ctx': 0,
            'params': 'N/A',
            'quant': 'N/A'
        }

        current_section = None
        for line in result.stdout.split('\n'):
            line = line.strip()
            if not line:
                continue

            if line in ["Model", "Parameters"]:
                current_section = line
                continue

            if current_section == "Model":
                parts = line.split(maxsplit=1)
                if len(parts) == 2:
                    k, v = parts[0].lower(), parts[1].strip()
                    if 'context' in k and 'length' in k:
                        if v.isdigit():
                            info['max_context'] = int(v)
                    elif 'context' in k:
                        # Handle "context length" as two words
                        parts2 = line.split()
                        if len(parts2) >= 3 and parts2[-1].isdigit():
                            info['max_context'] = int(parts2[-1])
                    elif 'quantization' in k:
                        info['quant'] = v
                    elif 'parameters' in k:
                        info['params'] = v

            elif current_section == "Parameters":
                if 'num_ctx' in line.lower():
                    parts = line.split(maxsplit=1)
                    if len(parts) == 2 and parts[1].strip().isdigit():
                        info['current_num_ctx'] = int(parts[1].strip())

        return info
    except subprocess.CalledProcessError:
        return {'max_context': 0, 'current_num_ctx': 0, 'params': 'N/A', 'quant': 'N/A'}
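
# Illustrative `ollama show` output that the parser above expects (example
# values only; the exact layout can vary between Ollama versions):
#
#   Model
#     parameters          3.2B
#     context length      131072
#     quantization        Q5_K_M
#
#   Parameters
#     num_ctx             8192
#
# Section headers ("Model", "Parameters") switch current_section; key/value
# rows are split on the first whitespace run, so "context length 131072" is
# picked up via the parts2[-1] fallback branch above.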


def test_context_size(model_name: str, num_ctx: int) -> Optional[Dict]:
    """
    Test a model with a specific context size.
    Returns VRAM usage and offload info, or None if failed.
    """
    url = f"{get_ollama_url()}/api/generate"
    prompt_data = {
        "model": model_name,
        "prompt": "Reply with only: OK",
        "stream": False,
        "options": {
            "num_ctx": num_ctx
        }
    }

    try:
        req = urllib.request.Request(
            url,
            data=json.dumps(prompt_data).encode('utf-8'),
            headers={'Content-Type': 'application/json'}
        )

        # Send request with longer timeout for large contexts
        # Large contexts can take time to allocate
        timeout = 60 if num_ctx > 100000 else 30

        with urllib.request.urlopen(req, timeout=timeout) as response:
            response_data = response.read().decode()

        # Check if response contains error
        try:
            resp_json = json.loads(response_data)
            if 'error' in resp_json:
                error_msg = resp_json['error']
                # Return special dict to indicate OOM or other errors
                return {
                    'vram_gb': 0,
                    'total_gb': 0,
                    'offload_pct': 0,
                    'num_ctx': num_ctx,
                    'error': error_msg
                }
        except:
            pass

        # Wait for model to stabilize
        time.sleep(0.5)

        # Check /api/ps for VRAM usage
        ps_url = f"{get_ollama_url()}/api/ps"
        with urllib.request.urlopen(ps_url, timeout=5) as r:
            ps_data = json.loads(r.read().decode())
            models = ps_data.get('models', [])

        for m in models:
            if m['name'] == model_name or m['name'].startswith(model_name + ':'):
                size_bytes = m.get('size', 0)
                size_vram = m.get('size_vram', 0)

                vram_gb = size_vram / (1024**3)
                total_gb = size_bytes / (1024**3)

                offload_pct = 0
                if size_bytes > 0:
                    offload_pct = ((size_bytes - size_vram) / size_bytes) * 100

                return {
                    'vram_gb': vram_gb,
                    'total_gb': total_gb,
                    'offload_pct': offload_pct,
                    'num_ctx': num_ctx
                }

        return None

    except urllib.error.HTTPError as e:
        # HTTP errors (500, etc.) - often indicates OOM or model loading failure
        try:
            error_body = e.read().decode()
            error_data = json.loads(error_body)
            error_msg = error_data.get('error', str(e))
        except:
            error_msg = f"HTTP {e.code}"

        return {
            'vram_gb': 0,
            'total_gb': 0,
            'offload_pct': 0,
            'num_ctx': num_ctx,
            'error': error_msg
        }

    except urllib.error.URLError as e:
        # Network/timeout errors
        if 'timed out' in str(e).lower():
            error_msg = "Timeout (loading too slow)"
        else:
            error_msg = f"Connection error: {e}"

        return {
            'vram_gb': 0,
            'total_gb': 0,
            'offload_pct': 0,
            'num_ctx': num_ctx,
            'error': error_msg
        }

    except Exception as e:
        # Other unexpected errors
        return {
            'vram_gb': 0,
            'total_gb': 0,
            'offload_pct': 0,
            'num_ctx': num_ctx,
            'error': str(e)[:50]
        }
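
# Worked example (illustrative numbers) for the offload calculation above:
# if /api/ps reports size=9.0 GiB and size_vram=7.2 GiB for a model, then
#   offload_pct = ((9.0 - 7.2) / 9.0) * 100 = 20.0
# i.e. 20% of the model lives in system RAM and runs on the CPU; 0% means it
# fits entirely in VRAM.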


def find_optimal_context(model_name: str, max_turns: Optional[int], overhead_gb: float) -> Dict:
    """
    Find optimal context size through intelligent testing.
    Uses VRAM measurements to extrapolate optimal size.

    Args:
        model_name: Name of the Ollama model to test
        max_turns: Maximum iterations (None = optimize until convergence)
        overhead_gb: VRAM to keep free for system overhead
    """
    print(f"Analyzing model: {model_name}")
    print("-" * 70)

    # Get model capabilities
    info = get_model_info(model_name)
    max_context = info['max_context']
    current_ctx = info['current_num_ctx']

    print(f"Model: {model_name}")
    print(f"Parameters: {info['params']} ({info['quant']})")
    print(f"Max context capability: {max_context:,}")
    print(f"Current num_ctx: {current_ctx:,}")

    if max_context == 0:
        print("\n✗ Could not determine model's max context capability")
        return {}

    # Get VRAM info
    vram_total, vram_available = get_gpu_vram()
    if vram_total:
        print(f"GPU VRAM: {vram_available:.1f} GB available / {vram_total:.1f} GB total")
        print(f"Overhead reserved: {overhead_gb:.1f} GB")
        # Reserve specified overhead
        target_vram = vram_total - overhead_gb
    else:
        print("⚠ Could not detect GPU VRAM (testing will continue)")
        target_vram = None

    if max_turns:
        print(f"Testing with max {max_turns} iterations...")
    else:
        print(f"Testing until convergence (num_ctx must be multiple of 2048)...")
    print()

    results = []

    # Turn 1: Test current setting to establish baseline
    test_ctx = current_ctx if current_ctx > 0 else 8192
    turn_label = f"Turn 1/{max_turns}" if max_turns else "Turn 1"
    print(f"{turn_label}: Testing num_ctx={test_ctx:,} (baseline)...", end=' ', flush=True)
    result = test_context_size(model_name, test_ctx)

    if result and 'error' not in result:
        results.append(result)
        print(f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: {result['offload_pct']:.1f}% CPU" if result['offload_pct'] > 0 else f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: GPU only")
        baseline_vram = result['vram_gb']
        baseline_ctx = test_ctx
    else:
        print("✗ Failed")
        return {'model': model_name, 'results': results, 'max_context': max_context, 'current_ctx': current_ctx, 'vram_total': vram_total, 'info': info}

    # Turn 2: Test a higher context to calculate VRAM/context ratio
    # Try doubling the context or 32K, whichever is smaller
    test_ctx_2 = min(baseline_ctx * 2, 32768, max_context)
    if test_ctx_2 <= baseline_ctx:
        test_ctx_2 = min(baseline_ctx + 16384, max_context)
    # Round to multiple of 2048
    test_ctx_2 = (test_ctx_2 // 2048) * 2048

    turn_label = f"Turn 2/{max_turns}" if max_turns else "Turn 2"
    print(f"{turn_label}: Testing num_ctx={test_ctx_2:,} (calibration)...", end=' ', flush=True)
    result = test_context_size(model_name, test_ctx_2)

    if result and 'error' not in result:
        results.append(result)
        print(f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: {result['offload_pct']:.1f}% CPU" if result['offload_pct'] > 0 else f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: GPU only")

        # Calculate VRAM per 1K context tokens
        vram_diff = result['vram_gb'] - baseline_vram
        ctx_diff = test_ctx_2 - baseline_ctx
        if ctx_diff > 0:
            vram_per_1k_ctx = (vram_diff / ctx_diff) * 1000
            print(f"  → Estimated VRAM usage: {vram_per_1k_ctx:.4f} GB per 1K context")

            # Predict optimal context size
            if target_vram and vram_per_1k_ctx > 0:
                available_for_ctx = target_vram - baseline_vram
                estimated_additional_ctx = (available_for_ctx / vram_per_1k_ctx) * 1000
                predicted_optimal = baseline_ctx + int(estimated_additional_ctx)
                # Round to multiple of 2048
                predicted_optimal = (predicted_optimal // 2048) * 2048
                predicted_optimal = max(baseline_ctx, min(predicted_optimal, max_context))

                print(f"  → Predicted optimal context: {predicted_optimal:,}")
            else:
                predicted_optimal = None
                vram_per_1k_ctx = None
        else:
            vram_per_1k_ctx = None
            predicted_optimal = None
    else:
        if result and 'error' in result:
            error_msg = result['error']
            if 'memory' in error_msg.lower() or 'oom' in error_msg.lower():
                print(f"✗ OOM (out of memory)")
            else:
                print(f"✗ Error: {error_msg[:30]}")
        else:
            print("✗ Failed")
        vram_per_1k_ctx = None
        predicted_optimal = None

    # Remaining turns: Test predicted optimal or use VRAM-based refinement
    min_ctx = baseline_ctx
    max_ctx = max_context

    turn = 2
    while True:
        # Check if we should stop
        if max_turns and turn >= max_turns:
            break

        if predicted_optimal and turn == 2:
            # Turn 3: Test predicted optimal
            test_ctx = predicted_optimal
            turn_label = f"Turn {turn + 1}/{max_turns}" if max_turns else f"Turn {turn + 1}"
            print(f"{turn_label}: Testing num_ctx={test_ctx:,} (predicted optimal)...", end=' ', flush=True)
        else:
            # Use VRAM-based prediction if we have the data
            if vram_per_1k_ctx and target_vram and len(results) > 0:
                # Find the last successful result (no offload)
                last_good = None
                for r in reversed(results):
                    if r['offload_pct'] == 0:
                        last_good = r
                        break

                if last_good and target_vram:
                    # Calculate how much more context we can realistically add
                    available_vram = target_vram - last_good['vram_gb']

                    # Calculate potential additional context
                    additional_ctx = (available_vram / vram_per_1k_ctx) * 1000

                    # If we can only add < 8K context, do small increments
                    if additional_ctx < 8192:
                        # Small increments - round up to next 2048 boundary
                        test_ctx = last_good['num_ctx'] + 2048
                        test_ctx = max(min_ctx + 2048, min(test_ctx, max_ctx))
                    else:
                        # Larger headroom - use 60% of predicted to be conservative
                        test_ctx = last_good['num_ctx'] + int(additional_ctx * 0.6)
                        test_ctx = (test_ctx // 2048) * 2048
                        test_ctx = max(min_ctx + 2048, min(test_ctx, max_ctx))
                else:
                    # No good result yet - binary search
                    test_ctx = (min_ctx + max_ctx) // 2
                    test_ctx = (test_ctx // 2048) * 2048
            else:
                # No VRAM data - fall back to binary search
                test_ctx = (min_ctx + max_ctx) // 2
                test_ctx = (test_ctx // 2048) * 2048

            # Avoid retesting same value
            if any(r['num_ctx'] == test_ctx for r in results):
                # Adjust by 2048
                if test_ctx < max_ctx:
                    test_ctx += 2048
                else:
                    test_ctx -= 2048

            if test_ctx <= min_ctx or test_ctx >= max_ctx:
                print(f"\nConverged after {turn + 1} turns")
                break

            turn_label = f"Turn {turn + 1}/{max_turns}" if max_turns else f"Turn {turn + 1}"
            print(f"{turn_label}: Testing num_ctx={test_ctx:,}...", end=' ', flush=True)

        result = test_context_size(model_name, test_ctx)

        if result is None:
            print("✗ Failed (model not found)")
            max_ctx = test_ctx
            continue

        if 'error' in result:
            error_msg = result['error']
            if 'memory' in error_msg.lower() or 'oom' in error_msg.lower():
                print(f"✗ OOM (out of memory)")
            elif 'timeout' in error_msg.lower():
                print(f"✗ Timeout")
            else:
                print(f"✗ Error: {error_msg[:30]}")
            max_ctx = test_ctx
            continue

        results.append(result)

        offload_str = f"{result['offload_pct']:.1f}% CPU" if result['offload_pct'] > 0 else "GPU only"
        print(f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: {offload_str}")

        # Adjust search bounds
        if result['offload_pct'] > 0:
            max_ctx = test_ctx
        else:
            min_ctx = test_ctx

        # Stop if we're converging (within one step of 2048)
        if max_ctx - min_ctx <= 2048:
            print(f"\nConverged after {turn + 1} turns")
            break

        turn += 1

    return {
        'model': model_name,
        'results': results,
        'max_context': max_context,
        'current_ctx': current_ctx,
        'vram_total': vram_total,
        'info': info
    }
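
# Worked example of the calibration math above (illustrative numbers):
#   Turn 1: num_ctx=8192  -> 4.0 GB VRAM (baseline)
#   Turn 2: num_ctx=32768 -> 5.2 GB VRAM
#   vram_per_1k_ctx = (5.2 - 4.0) / (32768 - 8192) * 1000 ≈ 0.049 GB
# With 16 GB of VRAM and --overhead 1.0, target_vram = 15 GB, so roughly
# (15 - 4.0) / 0.049 * 1000 ≈ 225,000 extra context tokens should fit, giving
# a predicted optimum around 233,000 once the baseline is added, the value is
# rounded down to a multiple of 2048, and it is capped at the model's max
# context. Later turns then refine this bound using the measured offload
# percentage.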


def print_recommendation(analysis: Dict):
    """Print optimization recommendations."""
    if not analysis or not analysis.get('results'):
        print("\n✗ No results to analyze")
        return

    results = analysis['results']
    max_context = analysis['max_context']
    current_ctx = analysis['current_ctx']

    print("\n" + "="*70)
    print("OPTIMIZATION RECOMMENDATION")
    print("="*70)

    # Find best context without offloading
    no_offload = [r for r in results if r['offload_pct'] == 0]

    if no_offload:
        # Recommend highest context without offloading
        best = max(no_offload, key=lambda x: x['num_ctx'])

        print(f"\n✓ Recommended num_ctx: {best['num_ctx']:,}")
        print(f"  VRAM usage: {best['vram_gb']:.2f} GB")
        print(f"  Status: Fits entirely in GPU memory")

        if best['num_ctx'] < max_context:
            print(f"\n⚠ Note: Model supports up to {max_context:,} context")
            print(f"  but VRAM limits optimal usage to {best['num_ctx']:,}")

        if current_ctx != best['num_ctx']:
            print(f"\n📝 Suggested Modelfile change:")
            print(f"  Current: PARAMETER num_ctx {current_ctx}")
            print(f"  Optimal: PARAMETER num_ctx {best['num_ctx']}")
    else:
        # All tests had offloading
        print("\n⚠ All tested configurations require CPU offloading")

        # Find least offloading
        least_offload = min(results, key=lambda x: x['offload_pct'])

        print(f"\n  Least offloading at num_ctx={least_offload['num_ctx']:,}")
        print(f"  CPU offload: {least_offload['offload_pct']:.1f}%")
        print(f"  VRAM usage: {least_offload['vram_gb']:.2f} GB")

        print(f"\n💡 Recommendations:")
        print(f"  1. Use lower quantization (Q4 instead of Q5/Q8)")
        print(f"  2. Reduce num_ctx to {least_offload['num_ctx']:,} or lower")
        print(f"  3. Consider a smaller model variant")

    # VRAM efficiency
    print(f"\n📊 Tested context sizes:")
    for r in sorted(results, key=lambda x: x['num_ctx']):
        status = "✓" if r['offload_pct'] == 0 else "✗"
        print(f"  {status} {r['num_ctx']:>6,}: {r['vram_gb']:>5.2f} GB VRAM, "
              f"{r['offload_pct']:>4.1f}% CPU offload")


def main():
    parser = argparse.ArgumentParser(
        description='Optimize Ollama model context size for VRAM constraints',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Optimize until convergence (default)
  %(prog)s ministral-3:3b-instruct-2512-q5_k_m

  # Use at most 10 iterations for a quicker, coarser optimization
  %(prog)s ministral-3:3b-instruct-2512-q5_k_m --turns 10
"""
    )

    parser.add_argument(
        'model',
        help='Model name to optimize'
    )

    parser.add_argument(
        '--turns',
        type=int,
        default=None,
        help='Maximum number of test iterations (default: optimize until convergence)'
    )

    parser.add_argument(
        '--overhead',
        type=float,
        default=1.0,
        help='VRAM overhead to keep free in GB (default: 1.0)'
    )

    args = parser.parse_args()

    if args.turns is not None and args.turns < 2:
        print("✗ Error: --turns must be at least 2")
        sys.exit(1)

    # Check if ollama is available
    try:
        subprocess.run(['ollama', '--version'], capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("✗ Error: 'ollama' command not found. Please install Ollama first.")
        sys.exit(1)

    # Run optimization
    analysis = find_optimal_context(args.model, args.turns, args.overhead)

    # Print recommendations
    print_recommendation(analysis)


if __name__ == '__main__':
    main()
638
scripts/hf-llm-install.py
Executable file
@@ -0,0 +1,638 @@
#!/usr/bin/env python3
"""
HuggingFace LLM Installer for Ollama
Automatically downloads GGUF files from HuggingFace and creates Ollama models.

Features:
- SHA256 checksum verification
- Disk space checking
- Dry run mode
- Parallel processing
- Skip existing models
"""

import argparse
import hashlib
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from urllib.parse import urlparse
import urllib.request


def parse_model_name_from_gguf(gguf_filename):
    """
    Parse model name and tag from GGUF filename.

    Args:
        gguf_filename: Name of the GGUF file

    Returns:
        Tuple of (model_base, tag, full_name) or (filename, 'latest', filename) if parsing fails
    """
    filename_stem = Path(gguf_filename).stem.lower()

    # Split on hyphens
    parts = filename_stem.split('-')
    if len(parts) >= 3:
        # Find where the size variant starts (e.g., "0.5b", "3b", "8b", "14b")
        base_parts = []
        tag_parts = []
        found_variant = False

        for part in parts:
            # Check if this looks like a size variant (e.g., "3b", "8b", "0.5b")
            if not found_variant and re.match(r'^\d+(\.\d+)?b$', part):
                found_variant = True
                tag_parts.append(part)
            elif found_variant:
                # Include everything after the variant (including quantization)
                tag_parts.append(part)
            else:
                # Before the variant = base name
                base_parts.append(part)

        if base_parts and tag_parts:
            model_base = '-'.join(base_parts)
            model_tag = '-'.join(tag_parts)
            full_name = f"{model_base}:{model_tag}"
            return (model_base, model_tag, full_name)

    # Fallback to filename without extension
    return (filename_stem, 'latest', filename_stem)


def parse_modelfile(modelfile_path):
    """
    Parse a Modelfile to extract HuggingFace upstream URL and model info.

    Args:
        modelfile_path: Path to the .Modelfile

    Returns:
        dict with model metadata or None if invalid
    """
    with open(modelfile_path, 'r') as f:
        content = f.read()

    # Look for hf_upstream in the header comments
    hf_match = re.search(r'#\s*hf_upstream:\s*(https://huggingface\.co/[^\s]+)', content)
    if not hf_match:
        return None

    hf_url = hf_match.group(1)

    # Look for optional quantization specification (default: q4_k_m)
    quant_match = re.search(r'#\s*quantization:\s*([a-zA-Z0-9_]+)', content)
    quantization = quant_match.group(1).upper() if quant_match else 'Q4_K_M'

    # Look for optional SHA256 checksum
    sha256_match = re.search(r'#\s*sha256:\s*([a-fA-F0-9]{64})', content)
    sha256 = sha256_match.group(1) if sha256_match else None

    # Look for optional capabilities (comma-separated list)
    # Format: # capabilities: tools, vision
    capabilities_match = re.search(r'#\s*capabilities:\s*([^\n]+)', content)
    capabilities = None
    if capabilities_match:
        # Parse comma-separated capabilities and clean whitespace
        caps_str = capabilities_match.group(1).strip()
        capabilities = [cap.strip() for cap in caps_str.split(',') if cap.strip()]

    # Check if URL points to a specific GGUF file or just the repo
    if hf_url.endswith('.gguf') or '/blob/' in hf_url or '/resolve/' in hf_url:
        # Specific file provided - use as-is
        resolve_url = hf_url.replace('/blob/', '/resolve/')
        gguf_filename = os.path.basename(urlparse(resolve_url).path)
    else:
        # Repository root provided - construct filename from repo name and quantization
        # URL format: https://huggingface.co/{org}/{repo}
        url_parts = urlparse(hf_url).path.strip('/').split('/')
        if len(url_parts) >= 2:
            repo_name = url_parts[1]  # e.g., "Ministral-3-3B-Instruct-2512-GGUF"

            # Remove -GGUF suffix if present (case-insensitive)
            if repo_name.upper().endswith('-GGUF'):
                repo_name = repo_name[:-5]

            # Construct filename: RepoName-Quantization.gguf
            gguf_filename = f"{repo_name}-{quantization}.gguf"
            resolve_url = f"{hf_url.rstrip('/')}/resolve/main/{gguf_filename}"
        else:
            print(f"✗ Invalid HuggingFace URL format: {hf_url}")
            return None

    # Extract model name and tag from the GGUF filename
    # Format: Model-Version-Variant-Year-Quant.gguf -> model:version-variant-year-quant
    # Example: Ministral-3-3B-Instruct-2512-Q5_K_M.gguf -> ministral-3:3b-instruct-2512-q5_k_m
    model_base, model_tag, model_name = parse_model_name_from_gguf(gguf_filename)

    return {
        'hf_url': hf_url,
        'resolve_url': resolve_url,
        'gguf_filename': gguf_filename,
        'model_name': model_name,
        'modelfile_path': modelfile_path,
        'sha256': sha256,
        'capabilities': capabilities
    }
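
# Illustrative .Modelfile header consumed by parse_modelfile() (example values;
# only hf_upstream is required, and the FROM line is rewritten later by
# create_ollama_model()):
#
#   # hf_upstream: https://huggingface.co/<org>/<repo>-GGUF
#   # quantization: q5_k_m
#   # sha256: <64 hex characters of the expected GGUF checksum>
#   # capabilities: tools, vision
#   FROM ./placeholder.gguf
#   PARAMETER num_ctx 8192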


def get_file_size(url):
    """
    Get the size of a file from URL without downloading it.

    Args:
        url: File URL

    Returns:
        Size in bytes or None if unavailable
    """
    try:
        req = urllib.request.Request(url, method='HEAD')
        with urllib.request.urlopen(req, timeout=10) as response:
            size = response.headers.get('Content-Length')
            return int(size) if size else None
    except Exception:
        return None


def check_disk_space(required_bytes, path='.'):
    """
    Check if there's enough disk space available.

    Args:
        required_bytes: Required space in bytes
        path: Path to check space on (default: current directory)

    Returns:
        Tuple of (has_space, available_bytes, required_bytes)
    """
    # Get absolute path to check actual filesystem
    abs_path = os.path.abspath(path)
    stat = shutil.disk_usage(abs_path)
    # Add 10% safety margin
    required_with_margin = int(required_bytes * 1.1)
    return (stat.free >= required_with_margin, stat.free, required_with_margin)
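
# Example: a 4.00 GB GGUF download needs 4.00 * 1.1 = 4.40 GB of free space to
# pass the check above (the 10% margin covers temporary files and filesystem
# overhead).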


def calculate_sha256(filepath, chunk_size=8192):
    """
    Calculate SHA256 checksum of a file.

    Args:
        filepath: Path to file
        chunk_size: Bytes to read at once

    Returns:
        SHA256 hex digest
    """
    sha256_hash = hashlib.sha256()
    with open(filepath, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            sha256_hash.update(chunk)
    return sha256_hash.hexdigest()


def verify_checksum(filepath, expected_sha256):
    """
    Verify file checksum matches expected value.

    Args:
        filepath: Path to file
        expected_sha256: Expected SHA256 hash

    Returns:
        True if match, False otherwise
    """
    print(f"  Verifying checksum...")
    actual = calculate_sha256(filepath)

    if actual.lower() == expected_sha256.lower():
        print(f"  ✓ Checksum verified: {actual[:16]}...")
        return True
    else:
        print(f"  ✗ Checksum mismatch!")
        print(f"    Expected: {expected_sha256}")
        print(f"    Actual: {actual}")
        return False


def get_existing_models():
    """
    Get list of existing Ollama models.

    Returns:
        Set of model names
    """
    try:
        result = subprocess.run(
            ['ollama', 'list'],
            capture_output=True,
            text=True,
            check=True
        )

        # Parse output to get model names
        # Format: NAME  ID  SIZE  MODIFIED
        models = set()
        for line in result.stdout.strip().split('\n')[1:]:  # Skip header
            if line.strip():
                # Get first column (name)
                name = line.split()[0]
                # Remove tag if present
                base_name = name.split(':')[0]
                models.add(base_name)

        return models
    except (subprocess.CalledProcessError, FileNotFoundError):
        return set()


def download_file(url, dest_path, filename, should_cancel=None, progress_callback=None):
    """
    Download a file from URL to destination with progress indication.

    Args:
        url: Source URL
        dest_path: Destination file path
        filename: Name for display purposes
        should_cancel: Optional callback function that returns True if download should be cancelled
        progress_callback: Optional callback function to report progress messages
    """
    def log(msg):
        """Helper to print and optionally call progress callback."""
        print(msg)
        if progress_callback:
            progress_callback(msg)

    log(f"Downloading {filename}...")
    log(f"  From: {url}")
    log(f"  To: {dest_path}")

    def show_progress(block_num, block_size, total_size):
        # Check for cancellation
        if should_cancel and should_cancel():
            raise InterruptedError("Download cancelled")

        downloaded = block_num * block_size
        if total_size > 0:
            percent = min(100, downloaded * 100 / total_size)
            mb_downloaded = downloaded / (1024 * 1024)
            mb_total = total_size / (1024 * 1024)
            msg = f"\r  Progress: {percent:.1f}% ({mb_downloaded:.1f}/{mb_total:.1f} MB)"
            print(msg, end='')
            if progress_callback:
                progress_callback(f"Progress: {percent:.1f}% ({mb_downloaded:.1f}/{mb_total:.1f} MB)")

    try:
        urllib.request.urlretrieve(url, dest_path, show_progress)
        print()  # New line after progress
        log(f"✓ Download complete")
    except Exception as e:
        print(f"\n✗ Download failed: {e}")
        if progress_callback:
            progress_callback(f"✗ Download failed: {e}")
        raise


def create_ollama_model(modelfile_path, gguf_path, model_name, capabilities=None):
    """
    Create an Ollama model from the Modelfile and GGUF file.

    Args:
        modelfile_path: Path to the .Modelfile
        gguf_path: Path to the downloaded GGUF file
        model_name: Name for the Ollama model
        capabilities: Optional list of capabilities to add (e.g., ['tools', 'vision'])
    """
    print(f"\nCreating Ollama model: {model_name}")

    # Note: Capabilities are detected from the GGUF file metadata by Ollama automatically
    if capabilities:
        print(f"  ℹ Expected capabilities from GGUF metadata: {', '.join(capabilities)}")

    # Read the Modelfile and update the FROM path to point to the downloaded GGUF
    with open(modelfile_path, 'r') as f:
        modelfile_content = f.read()

    # Replace the FROM line to use the actual GGUF path
    # Handle both relative paths like "./filename.gguf" and URLs like "https://..."
    original_content = modelfile_content
    modelfile_content = re.sub(
        r'FROM\s+(?:\./[^\s]+\.gguf|https?://[^\n]+)',
        f'FROM {gguf_path}',
        modelfile_content
    )

    # Debug: check if replacement happened
    if original_content == modelfile_content:
        print(f"  WARNING: FROM line was not replaced!")
        print(f"  Looking for pattern in: {original_content[:200]}")
    else:
        print(f"  ✓ Replaced FROM line with local path: {gguf_path}")

    # Create a temporary Modelfile with the correct path
    with tempfile.NamedTemporaryFile(mode='w', suffix='.Modelfile', delete=False) as tmp_modelfile:
        tmp_modelfile.write(modelfile_content)
        tmp_modelfile_path = tmp_modelfile.name

    try:
        # Run ollama create
        cmd = ['ollama', 'create', model_name, '-f', tmp_modelfile_path]
        print(f"  Running: {' '.join(cmd)}")

        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True
        )

        if result.returncode == 0:
            print(f"✓ Model '{model_name}' created successfully")
            if result.stdout:
                print(f"  {result.stdout.strip()}")
        else:
            print(f"✗ Failed to create model")
            if result.stderr:
                print(f"  Error: {result.stderr.strip()}")
            raise subprocess.CalledProcessError(result.returncode, cmd)

    finally:
        # Clean up temporary Modelfile
        os.unlink(tmp_modelfile_path)


def install_model(modelfile_path, dry_run=False, skip_existing=False, existing_models=None, should_cancel=None, progress_callback=None):
    """
    Install a single model from a Modelfile.

    Args:
        modelfile_path: Path to the .Modelfile
        dry_run: If True, only simulate installation
        skip_existing: If True, skip models already in Ollama
        existing_models: Set of existing model names
        should_cancel: Optional callback function that returns True if installation should be cancelled
        progress_callback: Optional callback function to report progress messages

    Returns:
        Tuple of (success: bool, skipped: bool, model_name: str)
    """
    def log(msg):
        """Helper to print and optionally call progress callback."""
        print(msg)
        if progress_callback:
            progress_callback(msg)

    log(f"\n{'='*80}")
    log(f"Processing: {modelfile_path}")
    log(f"{'='*80}")

    # Parse the Modelfile
    model_info = parse_modelfile(modelfile_path)
    if not model_info:
        log(f"✗ No hf_upstream found in {modelfile_path}")
        return (False, False, None)

    log(f"Model name: {model_info['model_name']}")
    log(f"GGUF file: {model_info['gguf_filename']}")
    if model_info['sha256']:
        log(f"SHA256: {model_info['sha256'][:16]}...")
    if model_info.get('capabilities'):
        log(f"Capabilities: {', '.join(model_info['capabilities'])}")

    # Check if model already exists
    if skip_existing and existing_models and model_info['model_name'] in existing_models:
        log(f"⊘ Model '{model_info['model_name']}' already exists, skipping")
        return (True, True, model_info['model_name'])

    # Get file size and check disk space
    file_size = get_file_size(model_info['resolve_url'])
    if file_size:
        size_gb = file_size / (1024**3)
        log(f"File size: {size_gb:.2f} GB")

        if not dry_run:
            has_space, available, required = check_disk_space(file_size)
            if not has_space:
                log(f"✗ Insufficient disk space!")
                log(f"  Required: {required / (1024**3):.2f} GB (with 10% margin)")
                log(f"  Available: {available / (1024**3):.2f} GB")
                return (False, False, model_info['model_name'])
            else:
                log(f"✓ Disk space check passed ({available / (1024**3):.2f} GB available)")

    if dry_run:
        log(f"\n[DRY RUN] Would download and install model: {model_info['model_name']}")
        return (True, False, model_info['model_name'])

    # Create temporary directory for download
    with tempfile.TemporaryDirectory() as tmp_dir:
        gguf_path = os.path.join(tmp_dir, model_info['gguf_filename'])

        try:
            # Download the GGUF file
            download_file(model_info['resolve_url'], gguf_path, model_info['gguf_filename'], should_cancel, progress_callback)

            # Verify checksum if provided
            if model_info['sha256']:
                if not verify_checksum(gguf_path, model_info['sha256']):
                    print(f"✗ Checksum verification failed!")
                    return (False, False, model_info['model_name'])

            # Create the Ollama model
            create_ollama_model(
                modelfile_path,
                gguf_path,
                model_info['model_name'],
                model_info.get('capabilities')
            )

            print(f"\n✓ Successfully installed model: {model_info['model_name']}")
            return (True, False, model_info['model_name'])

        except Exception as e:
            print(f"\n✗ Failed to install model: {e}")
            return (False, False, model_info['model_name'])


def install_model_wrapper(args):
    """Wrapper for parallel execution."""
    return install_model(*args)


def main():
    parser = argparse.ArgumentParser(
        description='Install Ollama models from HuggingFace using Modelfiles',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Install a single model
  %(prog)s path/to/model.Modelfile

  # Install all models in the default repo directory
  %(prog)s

  # Dry run to see what would be installed
  %(prog)s --dry-run

  # Skip models that already exist
  %(prog)s --skip-existing

  # Install with 3 parallel downloads
  %(prog)s --parallel 3
"""
    )

    parser.add_argument(
        'modelfile',
        nargs='?',
        help='Path to a specific .Modelfile to install (optional)'
    )

    parser.add_argument(
        '--dir',
        default='./modelfile-repo',
        help='Directory containing .Modelfile files (default: ./modelfile-repo)'
    )

    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Simulate installation without downloading or creating models'
    )

    parser.add_argument(
        '--skip-existing',
        action='store_true',
        help='Skip models that already exist in Ollama'
    )

    parser.add_argument(
        '--parallel',
        type=int,
        default=1,
        metavar='N',
        help='Number of parallel downloads/installations (default: 1)'
    )

    args = parser.parse_args()

    # Validate parallel argument
    if args.parallel < 1:
        print("✗ Error: --parallel must be at least 1")
        sys.exit(1)

    # Check if ollama is available
    try:
        subprocess.run(['ollama', '--version'], capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("✗ Error: 'ollama' command not found. Please install Ollama first.")
        print("  Visit: https://ollama.ai")
        sys.exit(1)

    # Get existing models if skip_existing is enabled
    existing_models = None
    if args.skip_existing:
        existing_models = get_existing_models()
        if existing_models:
            print(f"Found {len(existing_models)} existing model(s)")

    # Determine which Modelfiles to process
    if args.modelfile:
        # Single file mode
        modelfile_path = Path(args.modelfile)
        if not modelfile_path.exists():
            print(f"✗ Error: File not found: {modelfile_path}")
            sys.exit(1)

        if not modelfile_path.suffix == '.Modelfile':
            print(f"✗ Error: File must have .Modelfile extension")
            sys.exit(1)

        modelfiles = [modelfile_path]
    else:
        # Batch mode - process all .Modelfile files in directory
        modelfile_dir = Path(args.dir)
        if not modelfile_dir.exists():
            print(f"✗ Error: Directory not found: {modelfile_dir}")
            sys.exit(1)

        modelfiles = sorted(modelfile_dir.glob('*.Modelfile'))
        if not modelfiles:
            print(f"✗ No .Modelfile files found in {modelfile_dir}")
            sys.exit(1)

    print(f"Found {len(modelfiles)} Modelfile(s) to process")

    if args.dry_run:
        print("\n*** DRY RUN MODE - No files will be downloaded or models created ***\n")

    # Process all Modelfiles
    results = []

    if args.parallel > 1 and len(modelfiles) > 1:
        # Parallel processing
        print(f"\nUsing {args.parallel} parallel worker(s)")

        with ThreadPoolExecutor(max_workers=args.parallel) as executor:
            # Submit all tasks
            future_to_modelfile = {
                executor.submit(
                    install_model_wrapper,
                    (modelfile, args.dry_run, args.skip_existing, existing_models)
                ): modelfile
                for modelfile in modelfiles
            }

            # Collect results as they complete
            for future in as_completed(future_to_modelfile):
                modelfile = future_to_modelfile[future]
                try:
                    success, skipped, model_name = future.result()
                    results.append((modelfile.name, success, skipped))
                except Exception as e:
                    print(f"\n✗ Exception processing {modelfile.name}: {e}")
                    results.append((modelfile.name, False, False))
    else:
        # Sequential processing
        for modelfile in modelfiles:
            success, skipped, model_name = install_model(
                modelfile,
                args.dry_run,
                args.skip_existing,
                existing_models
            )
            results.append((modelfile.name, success, skipped))

    # Summary
    print(f"\n{'='*80}")
    print("INSTALLATION SUMMARY")
    print(f"{'='*80}")

    successful = sum(1 for _, success, skipped in results if success and not skipped)
    skipped = sum(1 for _, success, skip in results if skip)
    failed = len(results) - successful - skipped

    for name, success, skip in results:
        if skip:
            status = "⊘"
        elif success:
            status = "✓"
        else:
            status = "✗"
        print(f"{status} {name}")

    print(f"\nTotal: {len(results)} | Successful: {successful} | Skipped: {skipped} | Failed: {failed}")

    if failed > 0:
        sys.exit(1)


if __name__ == '__main__':
    main()
184
scripts/model-info.py
Normal file
@@ -0,0 +1,184 @@
#!/usr/bin/env python3
"""
Ollama Model Inventory
- Parses the official 'Capabilities' section from ollama show
- Accurate VRAM estimation
"""

import subprocess
import re
from typing import Dict, List


def get_cmd_output(cmd: List[str]) -> str:
    try:
        # Run command and get stdout
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        return result.stdout.strip()
    except subprocess.CalledProcessError:
        return ""


def parse_parameters(param_str: str) -> float:
    """Parses '8.0B' or '307M' into standard Billions (float)."""
    if not param_str or param_str == "N/A": return 0.0
    clean_val = re.sub(r"[^0-9.]", "", param_str)
    try:
        val = float(clean_val)
        if "M" in param_str.upper(): return val / 1000.0
        return val
    except ValueError: return 0.0


def estimate_vram(params_billions: float, quant: str, context: int, context_used: int) -> str:
    """Estimates VRAM usage (Model Weights + Typical KV Cache)."""
    if params_billions == 0.0: return "N/A"

    # 1. Weights size (approx. GB per billion parameters, i.e. bytes per parameter)
    q_up = quant.upper()
    if "MXFP4" in q_up or "FP4" in q_up: bpp = 0.55
    elif "Q8" in q_up: bpp = 1.0
    elif "Q6" in q_up: bpp = 0.85
    elif "Q5" in q_up: bpp = 0.75
    elif "Q4" in q_up: bpp = 0.65
    elif "Q3" in q_up: bpp = 0.55
    elif "Q2" in q_up: bpp = 0.45
    elif "IQ" in q_up: bpp = 0.35  # IQ quantization
    elif "F16" in q_up or "BF16" in q_up: bpp = 2.0
    elif "F32" in q_up: bpp = 4.0
    else: bpp = 0.65  # Default Q4_K_M

    weight_gb = params_billions * bpp

    # 2. KV Cache Size
    # More accurate formula: context_tokens * embedding_dim * layers * 2 (K+V) * bytes_per_value / 1e9
    # Simplified: For a typical LLM, ~0.002 GB per 1000 tokens at FP16
    # Use actual context_used if available, otherwise use a reasonable default (8K)
    effective_context = context_used if context_used > 0 else min(context, 8192)
    kv_cache_gb = (effective_context / 1000) * 0.002

    # 3. System Overhead (Ollama runtime, etc.)
    overhead_gb = 0.3

    total_gb = weight_gb + kv_cache_gb + overhead_gb

    if total_gb < 1: return f"{total_gb * 1024:.0f} MB"
    return f"{total_gb:.1f} GB"
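
# Worked example (illustrative): an 8.0B-parameter model at Q4_K_M
#   weights  ≈ 8.0 * 0.65            = 5.20 GB
#   KV cache ≈ (8192 / 1000) * 0.002 ≈ 0.02 GB   (num_ctx 8192)
#   overhead = 0.30 GB
#   total    ≈ 5.5 GB  -> reported as "5.5 GB"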

def get_model_info(name: str, disk_size: str) -> Dict:
    try:
        raw_show = get_cmd_output(['ollama', 'show', name])
    except Exception as e:
        return {
            'model': name,
            'disk': disk_size,
            'family': 'ERROR',
            'params_str': 'N/A',
            'quant': 'N/A',
            'context': 0,
            'context_used': 0,
            'caps': [],
            'vram': 'N/A'
        }

    info = {
        'model': name,
        'disk': disk_size,
        'family': 'N/A',
        'params_str': 'N/A',
        'quant': 'N/A',
        'context': 0,
        'context_used': 0,  # Actual context from Parameters section
        'caps': []
    }

    # -- State Machine Parsing --
    current_section = None

    lines = raw_show.split('\n')
    for line in lines:
        line = line.strip()
        if not line: continue

        # Detect Sections
        if line in ["Model", "Capabilities", "Parameters", "System", "License"]:
            current_section = line
            continue

        # Parse 'Model' Section
        if current_section == "Model":
            parts = line.split(maxsplit=1)
            if len(parts) == 2:
                k, v = parts[0].lower(), parts[1].strip()
                if 'architecture' in k: info['family'] = v
                elif 'parameters' in k: info['params_str'] = v
                elif 'quantization' in k: info['quant'] = v
                elif 'context' in k and 'length' in k:
                    if v.isdigit(): info['context'] = int(v)

            # Fallback regex for context
            if 'context' in line.lower() and info['context'] == 0:
                match = re.search(r'context\s+length\s+(\d+)', line, re.IGNORECASE)
                if match: info['context'] = int(match.group(1))

        # Parse 'Parameters' Section (runtime config)
        elif current_section == "Parameters":
            if 'num_ctx' in line.lower():
                parts = line.split(maxsplit=1)
                if len(parts) == 2 and parts[1].strip().isdigit():
                    info['context_used'] = int(parts[1].strip())

        # Parse 'Capabilities' Section
        elif current_section == "Capabilities":
            cap = line.lower()
            if cap in ['tools', 'vision', 'thinking', 'insert']:
                info['caps'].append(cap.capitalize())

    # -- VRAM Calc --
    p_val = parse_parameters(info['params_str'])
    info['vram'] = estimate_vram(p_val, info['quant'], info['context'], info['context_used'])

    return info


def main():
    print("Fetching Ollama inventory...")
    list_out = get_cmd_output(['ollama', 'list'])

    data = []
    lines = list_out.split('\n')[1:]

    for line in lines:
        if not line.strip(): continue
        parts = line.split()
        if len(parts) >= 3:
            name = parts[0]
            disk = parts[2]
            print(f"  Analyzing {name}...", end='\r')
            data.append(get_model_info(name, disk))

    print(" " * 60, end='\r')

    # Formatting Table
    w = {'m': 38, 'a': 12, 'p': 8, 'q': 10, 'ctx': 12, 'cp': 18, 'd': 8, 'v': 8}

    header = (f"{'MODEL':<{w['m']}} {'ARCH':<{w['a']}} {'PARAMS':<{w['p']}} "
              f"{'QUANT':<{w['q']}} {'CONTEXT':<{w['ctx']}} {'CAPS':<{w['cp']}} "
              f"{'DISK':>{w['d']}} {'VRAM':>{w['v']}}")

    print(header)
    print("-" * len(header))

    for r in data:
        caps_str = ", ".join(r['caps']) if r['caps'] else "-"
        # Truncate overly long names
        d_name = (r['model'][:w['m']-2] + '..') if len(r['model']) > w['m'] else r['model']

        # Format context: show used/max or just max if used not set
        if r['context_used'] > 0:
            ctx_str = f"{r['context_used']}/{r['context']}"
        else:
            ctx_str = str(r['context'])

        print(f"{d_name:<{w['m']}} {r['family']:<{w['a']}} {r['params_str']:<{w['p']}} "
              f"{r['quant']:<{w['q']}} {ctx_str:<{w['ctx']}} {caps_str:<{w['cp']}} "
              f"{r['disk']:>{w['d']}} {r['vram']:>{w['v']}}")


if __name__ == "__main__":
    main()
164
scripts/ollama-monitor.py
Normal file
@@ -0,0 +1,164 @@
#!/usr/bin/env python3
"""
Ollama Monitor - Real-time dashboard for Ollama instances
"""

import urllib.request
import json
import subprocess
import time
import os
import sys

# Terminal colors
CLEAR, BOLD, RESET = "\033[2J\033[H", "\033[1m", "\033[0m"
CYAN, GREEN, YELLOW, MAGENTA, RED = "\033[36m", "\033[32m", "\033[33m", "\033[35m", "\033[31m"


def discover_ollama_instances():
    """Auto-discover running Ollama instances."""
    instances = {}

    # Try default port
    if check_ollama_available("http://localhost:11434"):
        instances["Ollama (default)"] = "http://localhost:11434"

    # Try common alternative ports
    for port in [11435, 11436]:
        url = f"http://localhost:{port}"
        if check_ollama_available(url):
            instances[f"Ollama (port {port})"] = url

    return instances


def check_ollama_available(url):
    """Check if an Ollama instance is available at the given URL."""
    try:
        with urllib.request.urlopen(f"{url}/api/tags", timeout=1) as r:
            return r.status == 200
    except:
        return False


def get_ollama_ps(url):
    """Get running models from Ollama instance."""
    try:
        with urllib.request.urlopen(f"{url}/api/ps", timeout=0.5) as r:
            return json.loads(r.read().decode()).get('models', [])
    except Exception:
        return None


def get_gpu_metrics():
    """Try to get GPU metrics from AMD GPU sysfs."""
    try:
        # Try multiple possible GPU device paths
        device_paths = [
            "/sys/class/drm/card1/device/",
            "/sys/class/drm/card0/device/",
        ]

        for base_path in device_paths:
            if not os.path.exists(base_path):
                continue

            try:
                with open(base_path + "mem_info_vram_used", "r") as f:
                    used = int(f.read().strip()) / 1024 / 1024
                with open(base_path + "mem_info_vram_total", "r") as f:
                    total = int(f.read().strip()) / 1024 / 1024
                with open(base_path + "gpu_busy_percent", "r") as f:
                    load = int(f.read().strip())

                # Sanity check: If VRAM usage is low but load is 99%, it's a driver glitch
                if load == 99 and used < (total * 0.1):
                    load = 0

                return used, total, load
            except:
                continue

        return None, None, None
    except:
        return None, None, None


def get_sys_metrics():
    """Get system CPU and RAM metrics."""
    try:
        load_avg = os.getloadavg()[0]
        mem_output = subprocess.check_output("free -m", shell=True).decode().split('\n')[1].split()
        ram_used = int(mem_output[2])
        ram_total = int(mem_output[1])
        return load_avg, ram_used, ram_total
    except Exception:
        return 0.0, 0, 0
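
# The RAM numbers come from the second line of `free -m`, e.g. (illustrative):
#   Mem:   31988   12345   ...
# so ram_total = fields[1] and ram_used = fields[2], both in MiB, following
# the column order printed by procps `free`.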


def draw(instances):
    """Draw the monitoring dashboard."""
    load_avg, ram_used, ram_total = get_sys_metrics()
    vram_used, vram_total, gpu_load = get_gpu_metrics()

    out = [f"{CLEAR}{BOLD}{CYAN}=== OLLAMA MONITOR ==={RESET}"]

    # System metrics
    out.append(f"{BOLD}CPU Load:{RESET} {YELLOW}{load_avg:.2f}{RESET} | "
               f"{BOLD}RAM:{RESET} {MAGENTA}{ram_used}MB/{ram_total}MB{RESET}")

    # GPU metrics (if available)
    if vram_total is not None and gpu_load is not None:
        load_color = GREEN if gpu_load < 80 else RED
        out.append(f"{BOLD}GPU Load:{RESET} {load_color}{gpu_load}%{RESET} | "
                   f"{BOLD}VRAM:{RESET} {CYAN}{vram_used:.0f}MB/{vram_total:.0f}MB{RESET}")

    out.append("─" * 70)

    # Ollama instances
    for name, url in instances.items():
        models = get_ollama_ps(url)
        status = f"{GREEN}ONLINE{RESET}" if models is not None else f"{RED}OFFLINE{RESET}"
        out.append(f"\n{BOLD}{name}{RESET} [{status}] - {url}")

        if models:
            out.append(f"  {'MODEL':<40} {'SIZE':<12} {'UNTIL':<20}")
            for m in models:
                size_gb = m.get('size', 0) / (1024**3)
                until = m.get('expires_at', 'N/A')
                if until != 'N/A' and 'T' in until:
                    # Show only the time-of-day part of the ISO timestamp
                    until = until.split('T')[1].split('.')[0]

                out.append(f"  {m['name'][:39]:<40} {size_gb:>6.1f} GB   {until}")
        elif models is None:
            out.append(f"  {RED}Connection failed{RESET}")
        else:
            # Instance is reachable but has no models loaded
            out.append(f"  {YELLOW}IDLE{RESET}")

    print("\n".join(out) + f"\n\n{BOLD}{CYAN}Refreshing... (Ctrl+C to quit){RESET}")


def main():
    print("Discovering Ollama instances...")
    instances = discover_ollama_instances()

    if not instances:
        print(f"{RED}✗ No Ollama instances found.{RESET}")
        print("  Make sure Ollama is running on the default port (11434)")
        sys.exit(1)

    print(f"Found {len(instances)} instance(s). Starting monitor...\n")
    time.sleep(1)

    try:
        while True:
            draw(instances)
            time.sleep(1)
    except KeyboardInterrupt:
        print("\nMonitor stopped.")


if __name__ == "__main__":
    main()
274
scripts/vram-test.py
Executable file
@@ -0,0 +1,274 @@
#!/usr/bin/env python3
"""
Ollama VRAM Test - Evaluate if models fit in VRAM
Tests models with their configured parameters and reports VRAM usage and CPU offloading.
"""

import argparse
import json
import subprocess
import sys
import time
import urllib.request
from typing import Dict, List, Optional


def get_ollama_url():
    """Get the Ollama API URL."""
    return "http://localhost:11434"


def get_installed_models() -> List[str]:
    """Get list of installed Ollama models."""
    try:
        result = subprocess.run(
            ['ollama', 'list'],
            capture_output=True,
            text=True,
            check=True
        )

        models = []
        for line in result.stdout.strip().split('\n')[1:]:  # Skip header
            if line.strip():
                name = line.split()[0]
                models.append(name)

        return models
    except subprocess.CalledProcessError:
        return []


def get_model_info(model_name: str) -> Dict:
    """Get model information from ollama show."""
    try:
        result = subprocess.run(
            ['ollama', 'show', model_name],
            capture_output=True,
            text=True,
            check=True
        )

        info = {
            'size': 'N/A',
            'quant': 'N/A',
            'num_ctx': 'N/A',
            'params': 'N/A'
        }

        current_section = None
        for line in result.stdout.split('\n'):
            line = line.strip()
            if not line:
                continue

            if line in ["Model", "Parameters"]:
                current_section = line
                continue

            if current_section == "Model":
                parts = line.split(maxsplit=1)
                if len(parts) == 2:
                    k, v = parts[0].lower(), parts[1].strip()
                    if 'quantization' in k:
                        info['quant'] = v
                    elif 'parameters' in k:
                        info['params'] = v

            elif current_section == "Parameters":
                if 'num_ctx' in line.lower():
                    parts = line.split(maxsplit=1)
                    if len(parts) == 2:
                        info['num_ctx'] = parts[1].strip()

        return info
    except subprocess.CalledProcessError:
        return {'size': 'N/A', 'quant': 'N/A', 'num_ctx': 'N/A', 'params': 'N/A'}


def test_model_vram(model_name: str) -> Dict:
    """
    Test a model's VRAM usage by loading it with a minimal prompt.
    Returns dict with model stats and VRAM usage.
    """
    print(f"Testing {model_name}...", end=' ', flush=True)

    # Get model info first
    info = get_model_info(model_name)

    # Send a minimal test prompt to force model loading
    url = f"{get_ollama_url()}/api/generate"
    prompt_data = {
        "model": model_name,
        "prompt": "Reply with only: OK",
        "stream": False
    }

    try:
        req = urllib.request.Request(
            url,
            data=json.dumps(prompt_data).encode('utf-8'),
            headers={'Content-Type': 'application/json'}
        )

        # Send request and wait for model to load
        with urllib.request.urlopen(req, timeout=30) as response:
            response.read()  # Wait for completion

        # Give it a moment to stabilize
        time.sleep(0.5)

        # Now check /api/ps for VRAM usage
        ps_url = f"{get_ollama_url()}/api/ps"
        with urllib.request.urlopen(ps_url, timeout=5) as r:
            ps_data = json.loads(r.read().decode())
            models = ps_data.get('models', [])

        # Find our model in the running models
        for m in models:
            if m['name'] == model_name or m['name'].startswith(model_name + ':'):
                size_bytes = m.get('size', 0)
                size_vram = m.get('size_vram', 0)

                # Calculate VRAM usage in GB
                vram_gb = size_vram / (1024**3) if size_vram > 0 else 0
                total_gb = size_bytes / (1024**3) if size_bytes > 0 else 0

                # Calculate offload percentage (how much is on CPU)
                if size_bytes > 0:
                    offload_pct = ((size_bytes - size_vram) / size_bytes) * 100
                else:
                    offload_pct = 0

                print("✓")

                return {
                    'model': model_name,
                    'params': info['params'],
                    'size_gb': total_gb,
                    'quant': info['quant'],
                    'num_ctx': info['num_ctx'],
                    'vram_gb': vram_gb,
                    'offload_pct': offload_pct,
                    'success': True
                }

        # Model not found in ps output
        print("✗ (not in ps)")
        return {
            'model': model_name,
            'params': info['params'],
            'size_gb': 0,
            'quant': info['quant'],
            'num_ctx': info['num_ctx'],
            'vram_gb': 0,
            'offload_pct': 0,
            'success': False
        }

    except Exception as e:
        print(f"✗ ({str(e)[:30]})")
        return {
            'model': model_name,
            'params': info['params'],
            'size_gb': 0,
            'quant': info['quant'],
            'num_ctx': info['num_ctx'],
            'vram_gb': 0,
            'offload_pct': 0,
            'success': False
        }


def main():
    parser = argparse.ArgumentParser(
        description='Test Ollama models for VRAM usage and CPU offloading',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Test all installed models
  %(prog)s

  # Test a specific model
  %(prog)s ministral-3:3b-instruct-2512-q5_k_m
"""
    )

    parser.add_argument(
        'model',
        nargs='?',
        help='Specific model to test (optional, tests all if omitted)'
    )

    args = parser.parse_args()

    # Check if ollama is available
    try:
        subprocess.run(['ollama', '--version'], capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("✗ Error: 'ollama' command not found. Please install Ollama first.")
        sys.exit(1)

    # Determine which models to test
    if args.model:
        models = [args.model]
    else:
        models = get_installed_models()
        if not models:
            print("✗ No models found")
            sys.exit(1)
        print(f"Found {len(models)} installed model(s)\n")

    # Test each model
    results = []
    for model in models:
        result = test_model_vram(model)
        results.append(result)

    # Display results table
    print("\n" + "="*110)
    print("VRAM USAGE TEST RESULTS")
    print("="*110)

    # Column widths
    w = {'m': 38, 'p': 8, 's': 10, 'q': 10, 'ctx': 10, 'v': 10, 'o': 12}

    header = (f"{'MODEL':<{w['m']}} {'PARAMS':<{w['p']}} {'SIZE':<{w['s']}} "
              f"{'QUANT':<{w['q']}} {'NUM_CTX':<{w['ctx']}} {'VRAM':>{w['v']}} {'OFFLOAD':>{w['o']}}")

    print(header)
    print("-" * 110)

    for r in results:
        # Truncate long model names
        name = (r['model'][:w['m']-2] + '..') if len(r['model']) > w['m'] else r['model']

        # Format values
        size_str = f"{r['size_gb']:.1f} GB" if r['size_gb'] > 0 else "N/A"
        vram_str = f"{r['vram_gb']:.1f} GB" if r['vram_gb'] > 0 else "N/A"

        # Offload status
        if r['success']:
            if r['offload_pct'] > 0:
                offload_str = f"{r['offload_pct']:.1f}% CPU"
            else:
                offload_str = "0% (GPU only)"
        else:
            offload_str = "Failed"

        print(f"{name:<{w['m']}} {r['params']:<{w['p']}} {size_str:<{w['s']}} "
              f"{r['quant']:<{w['q']}} {r['num_ctx']:<{w['ctx']}} {vram_str:>{w['v']}} {offload_str:>{w['o']}}")

    # Summary
    successful = sum(1 for r in results if r['success'])
    with_offload = sum(1 for r in results if r['success'] and r['offload_pct'] > 0)

    print("\n" + "="*110)
    print(f"Tested: {len(results)} | Successful: {successful} | CPU Offloading: {with_offload}")

    if with_offload > 0:
        print(f"\n⚠ {with_offload} model(s) using CPU offloading - consider reducing num_ctx or using smaller quantization")


if __name__ == '__main__':
    main()