initial commit

2026-01-18 22:01:50 +01:00
parent ab25613358
commit c40874d7f0
20 changed files with 6287 additions and 11 deletions

scripts/context-optimizer.py Executable file

@@ -0,0 +1,557 @@
#!/usr/bin/env python3
"""
Ollama Context Optimizer - Find optimal num_ctx for models based on VRAM
Iteratively tests different context sizes to recommend the best setting.
"""
import argparse
import json
import subprocess
import sys
import time
import urllib.request
import urllib.error
from typing import Dict, Optional, Tuple
def get_ollama_url():
"""Get the Ollama API URL."""
return "http://localhost:11434"
def get_gpu_vram() -> Tuple[Optional[float], Optional[float]]:
"""Get GPU VRAM total and available in GB."""
import os
device_paths = [
"/sys/class/drm/card1/device/",
"/sys/class/drm/card0/device/",
]
for base_path in device_paths:
if not os.path.exists(base_path):
continue
try:
with open(base_path + "mem_info_vram_used", "r") as f:
used = int(f.read().strip()) / 1024 / 1024 / 1024 # GB
with open(base_path + "mem_info_vram_total", "r") as f:
total = int(f.read().strip()) / 1024 / 1024 / 1024 # GB
available = total - used
return total, available
except (OSError, ValueError):
continue
return None, None
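# Note: mem_info_vram_used/mem_info_vram_total (values in bytes) are exposed by the
# AMD amdgpu driver; only card0/card1 are probed here. On other GPUs or driver setups
# this helper returns (None, None) and the optimizer proceeds without a VRAM budget.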
def get_model_info(model_name: str) -> Dict:
"""Get model information including max context capability."""
try:
result = subprocess.run(
['ollama', 'show', model_name],
capture_output=True,
text=True,
check=True
)
info = {
'max_context': 0,
'current_num_ctx': 0,
'params': 'N/A',
'quant': 'N/A'
}
current_section = None
for line in result.stdout.split('\n'):
line = line.strip()
if not line:
continue
if line in ["Model", "Parameters"]:
current_section = line
continue
if current_section == "Model":
parts = line.split(maxsplit=1)
if len(parts) == 2:
k, v = parts[0].lower(), parts[1].strip()
if 'context' in k and 'length' in k:
if v.isdigit():
info['max_context'] = int(v)
elif 'context' in k:
# Handle "context length" as two words
parts2 = line.split()
if len(parts2) >= 3 and parts2[-1].isdigit():
info['max_context'] = int(parts2[-1])
elif 'quantization' in k:
info['quant'] = v
elif 'parameters' in k:
info['params'] = v
elif current_section == "Parameters":
if 'num_ctx' in line.lower():
parts = line.split(maxsplit=1)
if len(parts) == 2 and parts[1].strip().isdigit():
info['current_num_ctx'] = int(parts[1].strip())
return info
except subprocess.CalledProcessError:
return {'max_context': 0, 'current_num_ctx': 0, 'params': 'N/A', 'quant': 'N/A'}
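# Illustrative `ollama show` output that the section parser above expects; exact
# spacing and fields can vary between Ollama versions:
#   Model
#     architecture        llama
#     parameters          3.2B
#     context length      131072
#     quantization        Q5_K_M
#   Parameters
#     num_ctx             16384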
def test_context_size(model_name: str, num_ctx: int) -> Optional[Dict]:
"""
Test a model with a specific context size.
Returns VRAM usage and offload info, or None if failed.
"""
url = f"{get_ollama_url()}/api/generate"
prompt_data = {
"model": model_name,
"prompt": "Reply with only: OK",
"stream": False,
"options": {
"num_ctx": num_ctx
}
}
try:
req = urllib.request.Request(
url,
data=json.dumps(prompt_data).encode('utf-8'),
headers={'Content-Type': 'application/json'}
)
# Send request with longer timeout for large contexts
# Large contexts can take time to allocate
timeout = 60 if num_ctx > 100000 else 30
with urllib.request.urlopen(req, timeout=timeout) as response:
response_data = response.read().decode()
# Check if response contains error
try:
resp_json = json.loads(response_data)
if 'error' in resp_json:
error_msg = resp_json['error']
# Return special dict to indicate OOM or other errors
return {
'vram_gb': 0,
'total_gb': 0,
'offload_pct': 0,
'num_ctx': num_ctx,
'error': error_msg
}
except:
pass
# Wait for model to stabilize
time.sleep(0.5)
# Check /api/ps for VRAM usage
ps_url = f"{get_ollama_url()}/api/ps"
with urllib.request.urlopen(ps_url, timeout=5) as r:
ps_data = json.loads(r.read().decode())
models = ps_data.get('models', [])
for m in models:
if m['name'] == model_name or m['name'].startswith(model_name + ':'):
size_bytes = m.get('size', 0)
size_vram = m.get('size_vram', 0)
vram_gb = size_vram / (1024**3)
total_gb = size_bytes / (1024**3)
offload_pct = 0
if size_bytes > 0:
offload_pct = ((size_bytes - size_vram) / size_bytes) * 100
return {
'vram_gb': vram_gb,
'total_gb': total_gb,
'offload_pct': offload_pct,
'num_ctx': num_ctx
}
return None
except urllib.error.HTTPError as e:
# HTTP errors (500, etc.) - often indicates OOM or model loading failure
try:
error_body = e.read().decode()
error_data = json.loads(error_body)
error_msg = error_data.get('error', str(e))
except:
error_msg = f"HTTP {e.code}"
return {
'vram_gb': 0,
'total_gb': 0,
'offload_pct': 0,
'num_ctx': num_ctx,
'error': error_msg
}
except urllib.error.URLError as e:
# Network/timeout errors
if 'timed out' in str(e).lower():
error_msg = "Timeout (loading too slow)"
else:
error_msg = f"Connection error: {e}"
return {
'vram_gb': 0,
'total_gb': 0,
'offload_pct': 0,
'num_ctx': num_ctx,
'error': error_msg
}
except Exception as e:
# Other unexpected errors
return {
'vram_gb': 0,
'total_gb': 0,
'offload_pct': 0,
'num_ctx': num_ctx,
'error': str(e)[:50]
}
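# For reference, an abridged /api/ps response consumed above might look like this
# (illustrative values):
#   {"models": [{"name": "ministral-3:3b-instruct-2512-q5_k_m",
#                "size": 5100273664, "size_vram": 5100273664, ...}]}
# offload_pct is then (size - size_vram) / size * 100; e.g. 7 GB total with only
# 5 GB resident in VRAM means roughly 28.6% of the model is offloaded to CPU RAM.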
def find_optimal_context(model_name: str, max_turns: Optional[int], overhead_gb: float) -> Dict:
"""
Find optimal context size through intelligent testing.
Uses VRAM measurements to extrapolate optimal size.
Args:
model_name: Name of the Ollama model to test
max_turns: Maximum iterations (None = optimize until convergence)
overhead_gb: VRAM to keep free for system overhead
"""
print(f"Analyzing model: {model_name}")
print("-" * 70)
# Get model capabilities
info = get_model_info(model_name)
max_context = info['max_context']
current_ctx = info['current_num_ctx']
print(f"Model: {model_name}")
print(f"Parameters: {info['params']} ({info['quant']})")
print(f"Max context capability: {max_context:,}")
print(f"Current num_ctx: {current_ctx:,}")
if max_context == 0:
print("\n✗ Could not determine model's max context capability")
return {}
# Get VRAM info
vram_total, vram_available = get_gpu_vram()
if vram_total:
print(f"GPU VRAM: {vram_available:.1f} GB available / {vram_total:.1f} GB total")
print(f"Overhead reserved: {overhead_gb:.1f} GB")
# Reserve specified overhead
target_vram = vram_total - overhead_gb
else:
print("⚠ Could not detect GPU VRAM (testing will continue)")
target_vram = None
if max_turns:
print(f"Testing with max {max_turns} iterations...")
else:
print(f"Testing until convergence (num_ctx must be multiple of 2048)...")
print()
results = []
# Turn 1: Test current setting to establish baseline
test_ctx = current_ctx if current_ctx > 0 else 8192
turn_label = f"Turn 1/{max_turns}" if max_turns else "Turn 1"
print(f"{turn_label}: Testing num_ctx={test_ctx:,} (baseline)...", end=' ', flush=True)
result = test_context_size(model_name, test_ctx)
if result and 'error' not in result:
results.append(result)
print(f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: {result['offload_pct']:.1f}% CPU" if result['offload_pct'] > 0 else f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: GPU only")
baseline_vram = result['vram_gb']
baseline_ctx = test_ctx
else:
print("✗ Failed")
return {'model': model_name, 'results': results, 'max_context': max_context, 'current_ctx': current_ctx, 'vram_total': vram_total, 'info': info}
# Turn 2: Test a higher context to calculate VRAM/context ratio
# Try doubling the context or 32K, whichever is smaller
test_ctx_2 = min(baseline_ctx * 2, 32768, max_context)
if test_ctx_2 <= baseline_ctx:
test_ctx_2 = min(baseline_ctx + 16384, max_context)
# Round to multiple of 2048
test_ctx_2 = (test_ctx_2 // 2048) * 2048
turn_label = f"Turn 2/{max_turns}" if max_turns else "Turn 2"
print(f"{turn_label}: Testing num_ctx={test_ctx_2:,} (calibration)...", end=' ', flush=True)
result = test_context_size(model_name, test_ctx_2)
if result and 'error' not in result:
results.append(result)
print(f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: {result['offload_pct']:.1f}% CPU" if result['offload_pct'] > 0 else f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: GPU only")
# Calculate VRAM per 1K context tokens
vram_diff = result['vram_gb'] - baseline_vram
ctx_diff = test_ctx_2 - baseline_ctx
if ctx_diff > 0:
vram_per_1k_ctx = (vram_diff / ctx_diff) * 1000
print(f" → Estimated VRAM usage: {vram_per_1k_ctx:.4f} GB per 1K context")
# Predict optimal context size
if target_vram and vram_per_1k_ctx > 0:
available_for_ctx = target_vram - baseline_vram
estimated_additional_ctx = (available_for_ctx / vram_per_1k_ctx) * 1000
predicted_optimal = baseline_ctx + int(estimated_additional_ctx)
# Round to multiple of 2048
predicted_optimal = (predicted_optimal // 2048) * 2048
predicted_optimal = max(baseline_ctx, min(predicted_optimal, max_context))
print(f" → Predicted optimal context: {predicted_optimal:,}")
else:
predicted_optimal = None
vram_per_1k_ctx = None
else:
vram_per_1k_ctx = None
predicted_optimal = None
else:
if result and 'error' in result:
error_msg = result['error']
if 'memory' in error_msg.lower() or 'oom' in error_msg.lower():
print(f"✗ OOM (out of memory)")
else:
print(f"✗ Error: {error_msg[:30]}")
else:
print("✗ Failed")
vram_per_1k_ctx = None
predicted_optimal = None
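# Worked example of the linear extrapolation above (illustrative numbers): if
# 8,192 ctx needs 5.0 GB and 32,768 ctx needs 6.2 GB, the slope is
# (6.2 - 5.0) / (32768 - 8192) * 1000 ≈ 0.0488 GB per 1K tokens; with a 15 GB
# target budget the prediction is 8192 + (15 - 5.0) / slope * 1000 ≈ 213,000,
# rounded down to a multiple of 2048 and clamped to the model's max context.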
# Remaining turns: Test predicted optimal or use VRAM-based refinement
min_ctx = baseline_ctx
max_ctx = max_context
turn = 2
while True:
# Check if we should stop
if max_turns and turn >= max_turns:
break
if predicted_optimal and turn == 2:
# Turn 3: Test predicted optimal
test_ctx = predicted_optimal
turn_label = f"Turn {turn + 1}/{max_turns}" if max_turns else f"Turn {turn + 1}"
print(f"{turn_label}: Testing num_ctx={test_ctx:,} (predicted optimal)...", end=' ', flush=True)
else:
# Use VRAM-based prediction if we have the data
if vram_per_1k_ctx and target_vram and len(results) > 0:
# Find the last successful result (no offload)
last_good = None
for r in reversed(results):
if r['offload_pct'] == 0:
last_good = r
break
if last_good and target_vram:
# Calculate how much more context we can realistically add
available_vram = target_vram - last_good['vram_gb']
# Calculate potential additional context
additional_ctx = (available_vram / vram_per_1k_ctx) * 1000
# If we can only add < 8K context, do small increments
if additional_ctx < 8192:
# Small increments - round up to next 2048 boundary
test_ctx = last_good['num_ctx'] + 2048
test_ctx = max(min_ctx + 2048, min(test_ctx, max_ctx))
else:
# Larger headroom - use 60% of predicted to be conservative
test_ctx = last_good['num_ctx'] + int(additional_ctx * 0.6)
test_ctx = (test_ctx // 2048) * 2048
test_ctx = max(min_ctx + 2048, min(test_ctx, max_ctx))
else:
# No good result yet - binary search
test_ctx = (min_ctx + max_ctx) // 2
test_ctx = (test_ctx // 2048) * 2048
else:
# No VRAM data - fall back to binary search
test_ctx = (min_ctx + max_ctx) // 2
test_ctx = (test_ctx // 2048) * 2048
# Avoid retesting same value
if any(r['num_ctx'] == test_ctx for r in results):
# Adjust by 2048
if test_ctx < max_ctx:
test_ctx += 2048
else:
test_ctx -= 2048
if test_ctx <= min_ctx or test_ctx >= max_ctx:
print(f"\nConverged after {turn + 1} turns")
break
turn_label = f"Turn {turn + 1}/{max_turns}" if max_turns else f"Turn {turn + 1}"
print(f"{turn_label}: Testing num_ctx={test_ctx:,}...", end=' ', flush=True)
result = test_context_size(model_name, test_ctx)
if result is None:
print("✗ Failed (model not found)")
max_ctx = test_ctx
continue
if 'error' in result:
error_msg = result['error']
if 'memory' in error_msg.lower() or 'oom' in error_msg.lower():
print(f"✗ OOM (out of memory)")
elif 'timeout' in error_msg.lower():
print(f"✗ Timeout")
else:
print(f"✗ Error: {error_msg[:30]}")
max_ctx = test_ctx
continue
results.append(result)
offload_str = f"{result['offload_pct']:.1f}% CPU" if result['offload_pct'] > 0 else "GPU only"
print(f"✓ VRAM: {result['vram_gb']:.2f} GB, Offload: {offload_str}")
# Adjust search bounds
if result['offload_pct'] > 0:
max_ctx = test_ctx
else:
min_ctx = test_ctx
# Stop if we're converging (within one step of 2048)
if max_ctx - min_ctx <= 2048:
print(f"\nConverged after {turn + 1} turns")
break
turn += 1
return {
'model': model_name,
'results': results,
'max_context': max_context,
'current_ctx': current_ctx,
'vram_total': vram_total,
'info': info
}
def print_recommendation(analysis: Dict):
"""Print optimization recommendations."""
if not analysis or not analysis.get('results'):
print("\n✗ No results to analyze")
return
results = analysis['results']
max_context = analysis['max_context']
current_ctx = analysis['current_ctx']
print("\n" + "="*70)
print("OPTIMIZATION RECOMMENDATION")
print("="*70)
# Find best context without offloading
no_offload = [r for r in results if r['offload_pct'] == 0]
if no_offload:
# Recommend highest context without offloading
best = max(no_offload, key=lambda x: x['num_ctx'])
print(f"\n✓ Recommended num_ctx: {best['num_ctx']:,}")
print(f" VRAM usage: {best['vram_gb']:.2f} GB")
print(f" Status: Fits entirely in GPU memory")
if best['num_ctx'] < max_context:
print(f"\n⚠ Note: Model supports up to {max_context:,} context")
print(f" but VRAM limits optimal usage to {best['num_ctx']:,}")
if current_ctx != best['num_ctx']:
print(f"\n📝 Suggested Modelfile change:")
print(f" Current: PARAMETER num_ctx {current_ctx}")
print(f" Optimal: PARAMETER num_ctx {best['num_ctx']}")
else:
# All tests had offloading
print("\n⚠ All tested configurations require CPU offloading")
# Find least offloading
least_offload = min(results, key=lambda x: x['offload_pct'])
print(f"\n Least offloading at num_ctx={least_offload['num_ctx']:,}")
print(f" CPU offload: {least_offload['offload_pct']:.1f}%")
print(f" VRAM usage: {least_offload['vram_gb']:.2f} GB")
print(f"\n💡 Recommendations:")
print(f" 1. Use lower quantization (Q4 instead of Q5/Q8)")
print(f" 2. Reduce num_ctx to {least_offload['num_ctx']:,} or lower")
print(f" 3. Consider a smaller model variant")
# VRAM efficiency
print(f"\n📊 Tested context sizes:")
for r in sorted(results, key=lambda x: x['num_ctx']):
status = "" if r['offload_pct'] == 0 else ""
print(f" {status} {r['num_ctx']:>6,}: {r['vram_gb']:>5.2f} GB VRAM, "
f"{r['offload_pct']:>4.1f}% CPU offload")
def main():
parser = argparse.ArgumentParser(
description='Optimize Ollama model context size for VRAM constraints',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Optimize until convergence (default)
%(prog)s ministral-3:3b-instruct-2512-q5_k_m
# Cap testing at a maximum of 10 iterations
%(prog)s ministral-3:3b-instruct-2512-q5_k_m --turns 10
"""
)
parser.add_argument(
'model',
help='Model name to optimize'
)
parser.add_argument(
'--turns',
type=int,
default=None,
help='Maximum number of test iterations (default: optimize until convergence)'
)
parser.add_argument(
'--overhead',
type=float,
default=1.0,
help='VRAM overhead to keep free in GB (default: 1.0)'
)
args = parser.parse_args()
if args.turns is not None and args.turns < 2:
print("✗ Error: --turns must be at least 2")
sys.exit(1)
# Check if ollama is available
try:
subprocess.run(['ollama', '--version'], capture_output=True, check=True)
except (subprocess.CalledProcessError, FileNotFoundError):
print("✗ Error: 'ollama' command not found. Please install Ollama first.")
sys.exit(1)
# Run optimization
analysis = find_optimal_context(args.model, args.turns, args.overhead)
# Print recommendations
print_recommendation(analysis)
if __name__ == '__main__':
main()

scripts/hf-llm-install.py Executable file

@@ -0,0 +1,638 @@
#!/usr/bin/env python3
"""
HuggingFace LLM Installer for Ollama
Automatically downloads GGUF files from HuggingFace and creates Ollama models.
Features:
- SHA256 checksum verification
- Disk space checking
- Dry run mode
- Parallel processing
- Skip existing models
"""
import argparse
import hashlib
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from urllib.parse import urlparse
import urllib.request
def parse_model_name_from_gguf(gguf_filename):
"""
Parse model name and tag from GGUF filename.
Args:
gguf_filename: Name of the GGUF file
Returns:
Tuple of (model_base, tag, full_name) or (filename, 'latest', filename) if parsing fails
"""
filename_stem = Path(gguf_filename).stem.lower()
# Split on hyphens
parts = filename_stem.split('-')
if len(parts) >= 3:
# Find where the size variant starts (e.g., "0.5b", "3b", "8b", "14b")
base_parts = []
tag_parts = []
found_variant = False
for part in parts:
# Check if this looks like a size variant (e.g., "3b", "8b", "0.5b")
if not found_variant and re.match(r'^\d+(\.\d+)?b$', part):
found_variant = True
tag_parts.append(part)
elif found_variant:
# Include everything after the variant (including quantization)
tag_parts.append(part)
else:
# Before the variant = base name
base_parts.append(part)
if base_parts and tag_parts:
model_base = '-'.join(base_parts)
model_tag = '-'.join(tag_parts)
full_name = f"{model_base}:{model_tag}"
return (model_base, model_tag, full_name)
# Fallback to filename without extension
return (filename_stem, 'latest', filename_stem)
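# Worked example of the split above: "Ministral-3-3B-Instruct-2512-Q5_K_M.gguf"
# lower-cases to "ministral-3-3b-instruct-2512-q5_k_m"; "3b" is the first part
# matching the size pattern, so the base becomes "ministral-3", the tag becomes
# "3b-instruct-2512-q5_k_m", and the full name "ministral-3:3b-instruct-2512-q5_k_m".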
def parse_modelfile(modelfile_path):
"""
Parse a Modelfile to extract HuggingFace upstream URL and model info.
Args:
modelfile_path: Path to the .Modelfile
Returns:
dict with model metadata or None if invalid
"""
with open(modelfile_path, 'r') as f:
content = f.read()
# Look for hf_upstream in the header comments
hf_match = re.search(r'#\s*hf_upstream:\s*(https://huggingface\.co/[^\s]+)', content)
if not hf_match:
return None
hf_url = hf_match.group(1)
# Look for optional quantization specification (default: q4_k_m)
quant_match = re.search(r'#\s*quantization:\s*([a-zA-Z0-9_]+)', content)
quantization = quant_match.group(1).upper() if quant_match else 'Q4_K_M'
# Look for optional SHA256 checksum
sha256_match = re.search(r'#\s*sha256:\s*([a-fA-F0-9]{64})', content)
sha256 = sha256_match.group(1) if sha256_match else None
# Look for optional capabilities (comma-separated list)
# Format: # capabilities: tools, vision
capabilities_match = re.search(r'#\s*capabilities:\s*([^\n]+)', content)
capabilities = None
if capabilities_match:
# Parse comma-separated capabilities and clean whitespace
caps_str = capabilities_match.group(1).strip()
capabilities = [cap.strip() for cap in caps_str.split(',') if cap.strip()]
# Check if URL points to a specific GGUF file or just the repo
if hf_url.endswith('.gguf') or '/blob/' in hf_url or '/resolve/' in hf_url:
# Specific file provided - use as-is
resolve_url = hf_url.replace('/blob/', '/resolve/')
gguf_filename = os.path.basename(urlparse(resolve_url).path)
else:
# Repository root provided - construct filename from repo name and quantization
# URL format: https://huggingface.co/{org}/{repo}
url_parts = urlparse(hf_url).path.strip('/').split('/')
if len(url_parts) >= 2:
repo_name = url_parts[1] # e.g., "Ministral-3-3B-Instruct-2512-GGUF"
# Remove -GGUF suffix if present (case-insensitive)
if repo_name.upper().endswith('-GGUF'):
repo_name = repo_name[:-5]
# Construct filename: RepoName-Quantization.gguf
gguf_filename = f"{repo_name}-{quantization}.gguf"
resolve_url = f"{hf_url.rstrip('/')}/resolve/main/{gguf_filename}"
else:
print(f"✗ Invalid HuggingFace URL format: {hf_url}")
return None
# Extract model name and tag from the GGUF filename
# Format: Model-Version-Variant-Year-Quant.gguf -> model:version-variant-year-quant
# Example: Ministral-3-3B-Instruct-2512-Q5_K_M.gguf -> ministral-3:3b-instruct-2512-q5_k_m
model_base, model_tag, model_name = parse_model_name_from_gguf(gguf_filename)
return {
'hf_url': hf_url,
'resolve_url': resolve_url,
'gguf_filename': gguf_filename,
'model_name': model_name,
'modelfile_path': modelfile_path,
'sha256': sha256,
'capabilities': capabilities
}
def get_file_size(url):
"""
Get the size of a file from URL without downloading it.
Args:
url: File URL
Returns:
Size in bytes or None if unavailable
"""
try:
req = urllib.request.Request(url, method='HEAD')
with urllib.request.urlopen(req, timeout=10) as response:
size = response.headers.get('Content-Length')
return int(size) if size else None
except Exception:
return None
def check_disk_space(required_bytes, path='.'):
"""
Check if there's enough disk space available.
Args:
required_bytes: Required space in bytes
path: Path to check space on (default: current directory)
Returns:
Tuple of (has_space, available_bytes, required_bytes)
"""
# Get absolute path to check actual filesystem
abs_path = os.path.abspath(path)
stat = shutil.disk_usage(abs_path)
# Add 10% safety margin
required_with_margin = int(required_bytes * 1.1)
return (stat.free >= required_with_margin, stat.free, required_with_margin)
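# Example: a 4.5 GB GGUF passes the check only if at least 4.95 GB is free (4.5 * 1.1 margin).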
def calculate_sha256(filepath, chunk_size=8192):
"""
Calculate SHA256 checksum of a file.
Args:
filepath: Path to file
chunk_size: Bytes to read at once
Returns:
SHA256 hex digest
"""
sha256_hash = hashlib.sha256()
with open(filepath, 'rb') as f:
for chunk in iter(lambda: f.read(chunk_size), b''):
sha256_hash.update(chunk)
return sha256_hash.hexdigest()
def verify_checksum(filepath, expected_sha256):
"""
Verify file checksum matches expected value.
Args:
filepath: Path to file
expected_sha256: Expected SHA256 hash
Returns:
True if match, False otherwise
"""
print(f" Verifying checksum...")
actual = calculate_sha256(filepath)
if actual.lower() == expected_sha256.lower():
print(f" ✓ Checksum verified: {actual[:16]}...")
return True
else:
print(f" ✗ Checksum mismatch!")
print(f" Expected: {expected_sha256}")
print(f" Actual: {actual}")
return False
def get_existing_models():
"""
Get list of existing Ollama models.
Returns:
Set of model names
"""
try:
result = subprocess.run(
['ollama', 'list'],
capture_output=True,
text=True,
check=True
)
# Parse output to get model names
# Format: NAME ID SIZE MODIFIED
models = set()
for line in result.stdout.strip().split('\n')[1:]: # Skip header
if line.strip():
# Get first column (name)
name = line.split()[0]
# Remove tag if present
base_name = name.split(':')[0]
models.add(base_name)
return models
except (subprocess.CalledProcessError, FileNotFoundError):
return set()
def download_file(url, dest_path, filename, should_cancel=None, progress_callback=None):
"""
Download a file from URL to destination with progress indication.
Args:
url: Source URL
dest_path: Destination file path
filename: Name for display purposes
should_cancel: Optional callback function that returns True if download should be cancelled
progress_callback: Optional callback function to report progress messages
"""
def log(msg):
"""Helper to print and optionally call progress callback."""
print(msg)
if progress_callback:
progress_callback(msg)
log(f"Downloading {filename}...")
log(f" From: {url}")
log(f" To: {dest_path}")
def show_progress(block_num, block_size, total_size):
# Check for cancellation
if should_cancel and should_cancel():
raise InterruptedError("Download cancelled")
downloaded = block_num * block_size
if total_size > 0:
percent = min(100, downloaded * 100 / total_size)
mb_downloaded = downloaded / (1024 * 1024)
mb_total = total_size / (1024 * 1024)
msg = f"\r Progress: {percent:.1f}% ({mb_downloaded:.1f}/{mb_total:.1f} MB)"
print(msg, end='')
if progress_callback:
progress_callback(f"Progress: {percent:.1f}% ({mb_downloaded:.1f}/{mb_total:.1f} MB)")
try:
urllib.request.urlretrieve(url, dest_path, show_progress)
print() # New line after progress
log(f"✓ Download complete")
except Exception as e:
print(f"\n✗ Download failed: {e}")
if progress_callback:
progress_callback(f"✗ Download failed: {e}")
raise
def create_ollama_model(modelfile_path, gguf_path, model_name, capabilities=None):
"""
Create an Ollama model from the Modelfile and GGUF file.
Args:
modelfile_path: Path to the .Modelfile
gguf_path: Path to the downloaded GGUF file
model_name: Name for the Ollama model
capabilities: Optional list of capabilities to add (e.g., ['tools', 'vision'])
"""
print(f"\nCreating Ollama model: {model_name}")
# Note: Capabilities are detected from the GGUF file metadata by Ollama automatically
if capabilities:
print(f" Expected capabilities from GGUF metadata: {', '.join(capabilities)}")
# Read the Modelfile and update the FROM path to point to the downloaded GGUF
with open(modelfile_path, 'r') as f:
modelfile_content = f.read()
# Replace the FROM line to use the actual GGUF path
# Handle both relative paths like "./filename.gguf" and URLs like "https://..."
original_content = modelfile_content
modelfile_content = re.sub(
r'FROM\s+(?:\./[^\s]+\.gguf|https?://[^\n]+)',
f'FROM {gguf_path}',
modelfile_content
)
# Debug: check if replacement happened
if original_content == modelfile_content:
print(f" WARNING: FROM line was not replaced!")
print(f" Looking for pattern in: {original_content[:200]}")
else:
print(f" ✓ Replaced FROM line with local path: {gguf_path}")
# Create a temporary Modelfile with the correct path
with tempfile.NamedTemporaryFile(mode='w', suffix='.Modelfile', delete=False) as tmp_modelfile:
tmp_modelfile.write(modelfile_content)
tmp_modelfile_path = tmp_modelfile.name
try:
# Run ollama create
cmd = ['ollama', 'create', model_name, '-f', tmp_modelfile_path]
print(f" Running: {' '.join(cmd)}")
result = subprocess.run(
cmd,
capture_output=True,
text=True
)
if result.returncode == 0:
print(f"✓ Model '{model_name}' created successfully")
if result.stdout:
print(f" {result.stdout.strip()}")
else:
print(f"✗ Failed to create model")
if result.stderr:
print(f" Error: {result.stderr.strip()}")
raise subprocess.CalledProcessError(result.returncode, cmd)
finally:
# Clean up temporary Modelfile
os.unlink(tmp_modelfile_path)
def install_model(modelfile_path, dry_run=False, skip_existing=False, existing_models=None, should_cancel=None, progress_callback=None):
"""
Install a single model from a Modelfile.
Args:
modelfile_path: Path to the .Modelfile
dry_run: If True, only simulate installation
skip_existing: If True, skip models already in Ollama
existing_models: Set of existing model names
should_cancel: Optional callback function that returns True if installation should be cancelled
progress_callback: Optional callback function to report progress messages
Returns:
Tuple of (success: bool, skipped: bool, model_name: str)
"""
def log(msg):
"""Helper to print and optionally call progress callback."""
print(msg)
if progress_callback:
progress_callback(msg)
log(f"\n{'='*80}")
log(f"Processing: {modelfile_path}")
log(f"{'='*80}")
# Parse the Modelfile
model_info = parse_modelfile(modelfile_path)
if not model_info:
log(f"✗ No hf_upstream found in {modelfile_path}")
return (False, False, None)
log(f"Model name: {model_info['model_name']}")
log(f"GGUF file: {model_info['gguf_filename']}")
if model_info['sha256']:
log(f"SHA256: {model_info['sha256'][:16]}...")
if model_info.get('capabilities'):
log(f"Capabilities: {', '.join(model_info['capabilities'])}")
# Check if model already exists
if skip_existing and existing_models and model_info['model_name'] in existing_models:
log(f"⊘ Model '{model_info['model_name']}' already exists, skipping")
return (True, True, model_info['model_name'])
# Get file size and check disk space
file_size = get_file_size(model_info['resolve_url'])
if file_size:
size_gb = file_size / (1024**3)
log(f"File size: {size_gb:.2f} GB")
if not dry_run:
has_space, available, required = check_disk_space(file_size)
if not has_space:
log(f"✗ Insufficient disk space!")
log(f" Required: {required / (1024**3):.2f} GB (with 10% margin)")
log(f" Available: {available / (1024**3):.2f} GB")
return (False, False, model_info['model_name'])
else:
log(f"✓ Disk space check passed ({available / (1024**3):.2f} GB available)")
if dry_run:
log(f"\n[DRY RUN] Would download and install model: {model_info['model_name']}")
return (True, False, model_info['model_name'])
# Create temporary directory for download
with tempfile.TemporaryDirectory() as tmp_dir:
gguf_path = os.path.join(tmp_dir, model_info['gguf_filename'])
try:
# Download the GGUF file
download_file(model_info['resolve_url'], gguf_path, model_info['gguf_filename'], should_cancel, progress_callback)
# Verify checksum if provided
if model_info['sha256']:
if not verify_checksum(gguf_path, model_info['sha256']):
print(f"✗ Checksum verification failed!")
return (False, False, model_info['model_name'])
# Create the Ollama model
create_ollama_model(
modelfile_path,
gguf_path,
model_info['model_name'],
model_info.get('capabilities')
)
print(f"\n✓ Successfully installed model: {model_info['model_name']}")
return (True, False, model_info['model_name'])
except Exception as e:
print(f"\n✗ Failed to install model: {e}")
return (False, False, model_info['model_name'])
def install_model_wrapper(args):
"""Wrapper for parallel execution."""
return install_model(*args)
def main():
parser = argparse.ArgumentParser(
description='Install Ollama models from HuggingFace using Modelfiles',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Install a single model
%(prog)s path/to/model.Modelfile
# Install all models in the default repo directory
%(prog)s
# Dry run to see what would be installed
%(prog)s --dry-run
# Skip models that already exist
%(prog)s --skip-existing
# Install with 3 parallel downloads
%(prog)s --parallel 3
"""
)
parser.add_argument(
'modelfile',
nargs='?',
help='Path to a specific .Modelfile to install (optional)'
)
parser.add_argument(
'--dir',
default='./modelfile-repo',
help='Directory containing .Modelfile files (default: ./modelfile-repo)'
)
parser.add_argument(
'--dry-run',
action='store_true',
help='Simulate installation without downloading or creating models'
)
parser.add_argument(
'--skip-existing',
action='store_true',
help='Skip models that already exist in Ollama'
)
parser.add_argument(
'--parallel',
type=int,
default=1,
metavar='N',
help='Number of parallel downloads/installations (default: 1)'
)
args = parser.parse_args()
# Validate parallel argument
if args.parallel < 1:
print("✗ Error: --parallel must be at least 1")
sys.exit(1)
# Check if ollama is available
try:
subprocess.run(['ollama', '--version'], capture_output=True, check=True)
except (subprocess.CalledProcessError, FileNotFoundError):
print("✗ Error: 'ollama' command not found. Please install Ollama first.")
print(" Visit: https://ollama.ai")
sys.exit(1)
# Get existing models if skip_existing is enabled
existing_models = None
if args.skip_existing:
existing_models = get_existing_models()
if existing_models:
print(f"Found {len(existing_models)} existing model(s)")
# Determine which Modelfiles to process
if args.modelfile:
# Single file mode
modelfile_path = Path(args.modelfile)
if not modelfile_path.exists():
print(f"✗ Error: File not found: {modelfile_path}")
sys.exit(1)
if not modelfile_path.suffix == '.Modelfile':
print(f"✗ Error: File must have .Modelfile extension")
sys.exit(1)
modelfiles = [modelfile_path]
else:
# Batch mode - process all .Modelfile files in directory
modelfile_dir = Path(args.dir)
if not modelfile_dir.exists():
print(f"✗ Error: Directory not found: {modelfile_dir}")
sys.exit(1)
modelfiles = sorted(modelfile_dir.glob('*.Modelfile'))
if not modelfiles:
print(f"✗ No .Modelfile files found in {modelfile_dir}")
sys.exit(1)
print(f"Found {len(modelfiles)} Modelfile(s) to process")
if args.dry_run:
print("\n*** DRY RUN MODE - No files will be downloaded or models created ***\n")
# Process all Modelfiles
results = []
if args.parallel > 1 and len(modelfiles) > 1:
# Parallel processing
print(f"\nUsing {args.parallel} parallel worker(s)")
with ThreadPoolExecutor(max_workers=args.parallel) as executor:
# Submit all tasks
future_to_modelfile = {
executor.submit(
install_model_wrapper,
(modelfile, args.dry_run, args.skip_existing, existing_models)
): modelfile
for modelfile in modelfiles
}
# Collect results as they complete
for future in as_completed(future_to_modelfile):
modelfile = future_to_modelfile[future]
try:
success, skipped, model_name = future.result()
results.append((modelfile.name, success, skipped))
except Exception as e:
print(f"\n✗ Exception processing {modelfile.name}: {e}")
results.append((modelfile.name, False, False))
else:
# Sequential processing
for modelfile in modelfiles:
success, skipped, model_name = install_model(
modelfile,
args.dry_run,
args.skip_existing,
existing_models
)
results.append((modelfile.name, success, skipped))
# Summary
print(f"\n{'='*80}")
print("INSTALLATION SUMMARY")
print(f"{'='*80}")
successful = sum(1 for _, success, skipped in results if success and not skipped)
skipped = sum(1 for _, success, skip in results if skip)
failed = len(results) - successful - skipped
for name, success, skip in results:
if skip:
status = ""
elif success:
status = ""
else:
status = ""
print(f"{status} {name}")
print(f"\nTotal: {len(results)} | Successful: {successful} | Skipped: {skipped} | Failed: {failed}")
if failed > 0:
sys.exit(1)
if __name__ == '__main__':
main()

scripts/model-info.py Normal file

@@ -0,0 +1,184 @@
#!/usr/bin/env python3
"""
Ollama Model Inventory
- Parses the official 'Capabilities' section from ollama show
- Accurate VRAM estimation
"""
import subprocess
import re
from typing import Dict, List
def get_cmd_output(cmd: List[str]) -> str:
try:
# Run command and get stdout
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
return result.stdout.strip()
except subprocess.CalledProcessError:
return ""
def parse_parameters(param_str: str) -> float:
"""Parses '8.0B' or '307M' into standard Billions (float)"""
if not param_str or param_str == "N/A": return 0.0
clean_val = re.sub(r"[^0-9.]", "", param_str)
try:
val = float(clean_val)
if "M" in param_str.upper(): return val / 1000.0
return val
except ValueError: return 0.0
def estimate_vram(params_billions: float, quant: str, context: int, context_used: int) -> str:
"""Estimates VRAM usage (Model Weights + Typical KV Cache)."""
if params_billions == 0.0: return "N/A"
# 1. Weights size (bpp here is effectively bytes per parameter, i.e. GB per billion params)
q_up = quant.upper()
if "MXFP4" in q_up or "FP4" in q_up: bpp = 0.55
elif "Q8" in q_up: bpp = 1.0
elif "Q6" in q_up: bpp = 0.85
elif "Q5" in q_up: bpp = 0.75
elif "Q4" in q_up: bpp = 0.65
elif "Q3" in q_up: bpp = 0.55
elif "Q2" in q_up: bpp = 0.45
elif "IQ" in q_up: bpp = 0.35 # IQ quantization
elif "F16" in q_up or "BF16" in q_up: bpp = 2.0
elif "F32" in q_up: bpp = 4.0
else: bpp = 0.65 # Default Q4_K_M
weight_gb = params_billions * bpp
# 2. KV Cache Size
# More accurate formula: context_tokens * embedding_dim * layers * 2 (K+V) * bytes_per_value / 1e9
# Simplified: For a typical LLM, ~0.002 GB per 1000 tokens at FP16
# Use actual context_used if available, otherwise use a reasonable default (8K)
effective_context = context_used if context_used > 0 else min(context, 8192)
kv_cache_gb = (effective_context / 1000) * 0.002
# 3. System Overhead (Ollama runtime, etc.)
overhead_gb = 0.3
total_gb = weight_gb + kv_cache_gb + overhead_gb
if total_gb < 1: return f"{total_gb * 1024:.0f} MB"
return f"{total_gb:.1f} GB"
def get_model_info(name: str, disk_size: str) -> Dict:
try:
raw_show = get_cmd_output(['ollama', 'show', name])
except Exception as e:
return {
'model': name,
'disk': disk_size,
'family': 'ERROR',
'params_str': 'N/A',
'quant': 'N/A',
'context': 0,
'context_used': 0,
'caps': [],
'vram': 'N/A'
}
info = {
'model': name,
'disk': disk_size,
'family': 'N/A',
'params_str': 'N/A',
'quant': 'N/A',
'context': 0,
'context_used': 0, # Actual context from Parameters section
'caps': []
}
# -- State Machine Parsing --
current_section = None
lines = raw_show.split('\n')
for line in lines:
line = line.strip()
if not line: continue
# Detect Sections
if line in ["Model", "Capabilities", "Parameters", "System", "License"]:
current_section = line
continue
# Parse 'Model' Section
if current_section == "Model":
parts = line.split(maxsplit=1)
if len(parts) == 2:
k, v = parts[0].lower(), parts[1].strip()
if 'architecture' in k: info['family'] = v
elif 'parameters' in k: info['params_str'] = v
elif 'quantization' in k: info['quant'] = v
elif 'context' in k and 'length' in k:
if v.isdigit(): info['context'] = int(v)
# Fallback regex for context
if 'context' in line.lower() and info['context'] == 0:
match = re.search(r'context\s+length\s+(\d+)', line, re.IGNORECASE)
if match: info['context'] = int(match.group(1))
# Parse 'Parameters' Section (runtime config)
elif current_section == "Parameters":
if 'num_ctx' in line.lower():
parts = line.split(maxsplit=1)
if len(parts) == 2 and parts[1].strip().isdigit():
info['context_used'] = int(parts[1].strip())
# Parse 'Capabilities' Section
elif current_section == "Capabilities":
cap = line.lower()
if cap in ['tools', 'vision', 'thinking', 'insert']:
info['caps'].append(cap.capitalize())
# -- VRAM Calc --
p_val = parse_parameters(info['params_str'])
info['vram'] = estimate_vram(p_val, info['quant'], info['context'], info['context_used'])
return info
def main():
print("Fetching Ollama inventory...")
list_out = get_cmd_output(['ollama', 'list'])
data = []
lines = list_out.split('\n')[1:]
for line in lines:
if not line.strip(): continue
parts = line.split()
if len(parts) >= 3:
name = parts[0]
disk = parts[2]
print(f" Analyzing {name}...", end='\r')
data.append(get_model_info(name, disk))
print(" " * 60, end='\r')
# Formatting Table
w = {'m': 38, 'a': 12, 'p': 8, 'q': 10, 'ctx': 12, 'cp': 18, 'd': 8, 'v': 8}
header = (f"{'MODEL':<{w['m']}} {'ARCH':<{w['a']}} {'PARAMS':<{w['p']}} "
f"{'QUANT':<{w['q']}} {'CONTEXT':<{w['ctx']}} {'CAPS':<{w['cp']}} "
f"{'DISK':>{w['d']}} {'VRAM':>{w['v']}}")
print(header)
print("-" * len(header))
for r in data:
caps_str = ", ".join(r['caps']) if r['caps'] else "-"
# Truncate overly long names
d_name = (r['model'][:w['m']-2] + '..') if len(r['model']) > w['m'] else r['model']
# Format context: show used/max or just max if used not set
if r['context_used'] > 0:
ctx_str = f"{r['context_used']}/{r['context']}"
else:
ctx_str = str(r['context'])
print(f"{d_name:<{w['m']}} {r['family']:<{w['a']}} {r['params_str']:<{w['p']}} "
f"{r['quant']:<{w['q']}} {ctx_str:<{w['ctx']}} {caps_str:<{w['cp']}} "
f"{r['disk']:>{w['d']}} {r['vram']:>{w['v']}}")
if __name__ == "__main__":
main()

scripts/ollama-monitor.py Normal file

@@ -0,0 +1,164 @@
#!/usr/bin/env python3
"""
Ollama Monitor - Real-time dashboard for Ollama instances
"""
import urllib.request
import json
import subprocess
import time
import os
import sys
# Terminal colors
CLEAR, BOLD, RESET = "\033[2J\033[H", "\033[1m", "\033[0m"
CYAN, GREEN, YELLOW, MAGENTA, RED = "\033[36m", "\033[32m", "\033[33m", "\033[35m", "\033[31m"
def discover_ollama_instances():
"""Auto-discover running Ollama instances."""
instances = {}
# Try default port
if check_ollama_available("http://localhost:11434"):
instances["Ollama (default)"] = "http://localhost:11434"
# Try common alternative ports
for port in [11435, 11436]:
url = f"http://localhost:{port}"
if check_ollama_available(url):
instances[f"Ollama (port {port})"] = url
return instances
def check_ollama_available(url):
"""Check if an Ollama instance is available at the given URL."""
try:
with urllib.request.urlopen(f"{url}/api/tags", timeout=1) as r:
return r.status == 200
except Exception:
return False
def get_ollama_ps(url):
"""Get running models from Ollama instance."""
try:
with urllib.request.urlopen(f"{url}/api/ps", timeout=0.5) as r:
return json.loads(r.read().decode()).get('models', [])
except Exception:
return None
def get_gpu_metrics():
"""Try to get GPU metrics from AMD GPU sysfs."""
try:
# Try multiple possible GPU device paths
device_paths = [
"/sys/class/drm/card1/device/",
"/sys/class/drm/card0/device/",
]
for base_path in device_paths:
if not os.path.exists(base_path):
continue
try:
with open(base_path + "mem_info_vram_used", "r") as f:
used = int(f.read().strip()) / 1024 / 1024
with open(base_path + "mem_info_vram_total", "r") as f:
total = int(f.read().strip()) / 1024 / 1024
with open(base_path + "gpu_busy_percent", "r") as f:
load = int(f.read().strip())
# Sanity check: If VRAM usage is low but load is 99%, it's a driver glitch
if load == 99 and used < (total * 0.1):
load = 0
return used, total, load
except (OSError, ValueError):
continue
return None, None, None
except Exception:
return None, None, None
def get_sys_metrics():
"""Get system CPU and RAM metrics."""
try:
load_avg = os.getloadavg()[0]
mem_output = subprocess.check_output("free -m", shell=True).decode().split('\n')[1].split()
ram_used = int(mem_output[2])
ram_total = int(mem_output[1])
return load_avg, ram_used, ram_total
except Exception:
return 0.0, 0, 0
def draw(instances):
"""Draw the monitoring dashboard."""
load_avg, ram_used, ram_total = get_sys_metrics()
vram_used, vram_total, gpu_load = get_gpu_metrics()
out = [f"{CLEAR}{BOLD}{CYAN}=== OLLAMA MONITOR ==={RESET}"]
# System metrics
out.append(f"{BOLD}CPU Load:{RESET} {YELLOW}{load_avg:.2f}{RESET} | "
f"{BOLD}RAM:{RESET} {MAGENTA}{ram_used}MB/{ram_total}MB{RESET}", )
# GPU metrics (if available)
if vram_total is not None and gpu_load is not None:
load_color = GREEN if gpu_load < 80 else RED
out.append(f"{BOLD}GPU Load:{RESET} {load_color}{gpu_load}%{RESET} | "
f"{BOLD}VRAM:{RESET} {CYAN}{vram_used:.0f}MB/{vram_total:.0f}MB{RESET}")
out.append("" * 70)
# Ollama instances
for name, url in instances.items():
models = get_ollama_ps(url)
status = f"{GREEN}ONLINE{RESET}" if models is not None else f"{RED}OFFLINE{RESET}"
out.append(f"\n{BOLD}{name}{RESET} [{status}] - {url}")
if models is not None:
if len(models) > 0:
out.append(f" {'MODEL':<40} {'SIZE':<12} {'UNTIL':<20}")
for m in models:
size_gb = m.get('size', 0) / (1024**3)
until = m.get('expires_at', 'N/A')
if until != 'N/A' and 'T' in until:
# Show just the time-of-day portion of the ISO timestamp
until = until.split('T')[1].split('.')[0]
out.append(f" {m['name'][:39]:<40} {size_gb:>6.1f} GB {until}")
else:
out.append(f" {YELLOW}IDLE{RESET}")
elif models is None:
out.append(f" {RED}Connection failed{RESET}")
print("\n".join(out) + f"\n\n{BOLD}{CYAN}Refreshing... (Ctrl+C to quit){RESET}")
def main():
print("Discovering Ollama instances...")
instances = discover_ollama_instances()
if not instances:
print(f"{RED}✗ No Ollama instances found.{RESET}")
print(" Make sure Ollama is running on the default port (11434)")
sys.exit(1)
print(f"Found {len(instances)} instance(s). Starting monitor...\n")
time.sleep(1)
try:
while True:
draw(instances)
time.sleep(1)
except KeyboardInterrupt:
print("\nMonitor stopped.")
if __name__ == "__main__":
main()

scripts/vram-test.py Executable file

@@ -0,0 +1,274 @@
#!/usr/bin/env python3
"""
Ollama VRAM Test - Evaluate if models fit in VRAM
Tests models with their configured parameters and reports VRAM usage and CPU offloading.
"""
import argparse
import json
import subprocess
import sys
import time
import urllib.request
from typing import Dict, List, Optional
def get_ollama_url():
"""Get the Ollama API URL."""
return "http://localhost:11434"
def get_installed_models() -> List[str]:
"""Get list of installed Ollama models."""
try:
result = subprocess.run(
['ollama', 'list'],
capture_output=True,
text=True,
check=True
)
models = []
for line in result.stdout.strip().split('\n')[1:]: # Skip header
if line.strip():
name = line.split()[0]
models.append(name)
return models
except subprocess.CalledProcessError:
return []
def get_model_info(model_name: str) -> Dict:
"""Get model information from ollama show."""
try:
result = subprocess.run(
['ollama', 'show', model_name],
capture_output=True,
text=True,
check=True
)
info = {
'size': 'N/A',
'quant': 'N/A',
'num_ctx': 'N/A',
'params': 'N/A'
}
current_section = None
for line in result.stdout.split('\n'):
line = line.strip()
if not line:
continue
if line in ["Model", "Parameters"]:
current_section = line
continue
if current_section == "Model":
parts = line.split(maxsplit=1)
if len(parts) == 2:
k, v = parts[0].lower(), parts[1].strip()
if 'quantization' in k:
info['quant'] = v
elif 'parameters' in k:
info['params'] = v
elif current_section == "Parameters":
if 'num_ctx' in line.lower():
parts = line.split(maxsplit=1)
if len(parts) == 2:
info['num_ctx'] = parts[1].strip()
return info
except subprocess.CalledProcessError:
return {'size': 'N/A', 'quant': 'N/A', 'num_ctx': 'N/A', 'params': 'N/A'}
def test_model_vram(model_name: str) -> Dict:
"""
Test a model's VRAM usage by loading it with a minimal prompt.
Returns dict with model stats and VRAM usage.
"""
print(f"Testing {model_name}...", end=' ', flush=True)
# Get model info first
info = get_model_info(model_name)
# Send a minimal test prompt to force model loading
url = f"{get_ollama_url()}/api/generate"
prompt_data = {
"model": model_name,
"prompt": "Reply with only: OK",
"stream": False
}
try:
req = urllib.request.Request(
url,
data=json.dumps(prompt_data).encode('utf-8'),
headers={'Content-Type': 'application/json'}
)
# Send request and wait for model to load
with urllib.request.urlopen(req, timeout=30) as response:
response.read() # Wait for completion
# Give it a moment to stabilize
time.sleep(0.5)
# Now check /api/ps for VRAM usage
ps_url = f"{get_ollama_url()}/api/ps"
with urllib.request.urlopen(ps_url, timeout=5) as r:
ps_data = json.loads(r.read().decode())
models = ps_data.get('models', [])
# Find our model in the running models
for m in models:
if m['name'] == model_name or m['name'].startswith(model_name + ':'):
size_bytes = m.get('size', 0)
size_vram = m.get('size_vram', 0)
# Calculate VRAM usage in GB
vram_gb = size_vram / (1024**3) if size_vram > 0 else 0
total_gb = size_bytes / (1024**3) if size_bytes > 0 else 0
# Calculate offload percentage (how much is on CPU)
if size_bytes > 0:
offload_pct = ((size_bytes - size_vram) / size_bytes) * 100
else:
offload_pct = 0
print("")
return {
'model': model_name,
'params': info['params'],
'size_gb': total_gb,
'quant': info['quant'],
'num_ctx': info['num_ctx'],
'vram_gb': vram_gb,
'offload_pct': offload_pct,
'success': True
}
# Model not found in ps output
print("✗ (not in ps)")
return {
'model': model_name,
'params': info['params'],
'size_gb': 0,
'quant': info['quant'],
'num_ctx': info['num_ctx'],
'vram_gb': 0,
'offload_pct': 0,
'success': False
}
except Exception as e:
print(f"✗ ({str(e)[:30]})")
return {
'model': model_name,
'params': info['params'],
'size_gb': 0,
'quant': info['quant'],
'num_ctx': info['num_ctx'],
'vram_gb': 0,
'offload_pct': 0,
'success': False
}
def main():
parser = argparse.ArgumentParser(
description='Test Ollama models for VRAM usage and CPU offloading',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Test all installed models
%(prog)s
# Test a specific model
%(prog)s ministral-3:3b-instruct-2512-q5_k_m
"""
)
parser.add_argument(
'model',
nargs='?',
help='Specific model to test (optional, tests all if omitted)'
)
args = parser.parse_args()
# Check if ollama is available
try:
subprocess.run(['ollama', '--version'], capture_output=True, check=True)
except (subprocess.CalledProcessError, FileNotFoundError):
print("✗ Error: 'ollama' command not found. Please install Ollama first.")
sys.exit(1)
# Determine which models to test
if args.model:
models = [args.model]
else:
models = get_installed_models()
if not models:
print("✗ No models found")
sys.exit(1)
print(f"Found {len(models)} installed model(s)\n")
# Test each model
results = []
for model in models:
result = test_model_vram(model)
results.append(result)
# Display results table
print("\n" + "="*110)
print("VRAM USAGE TEST RESULTS")
print("="*110)
# Column widths
w = {'m': 38, 'p': 8, 's': 10, 'q': 10, 'ctx': 10, 'v': 10, 'o': 12}
header = (f"{'MODEL':<{w['m']}} {'PARAMS':<{w['p']}} {'SIZE':<{w['s']}} "
f"{'QUANT':<{w['q']}} {'NUM_CTX':<{w['ctx']}} {'VRAM':>{w['v']}} {'OFFLOAD':>{w['o']}}")
print(header)
print("-" * 110)
for r in results:
# Truncate long model names
name = (r['model'][:w['m']-2] + '..') if len(r['model']) > w['m'] else r['model']
# Format values
size_str = f"{r['size_gb']:.1f} GB" if r['size_gb'] > 0 else "N/A"
vram_str = f"{r['vram_gb']:.1f} GB" if r['vram_gb'] > 0 else "N/A"
# Offload status
if r['success']:
if r['offload_pct'] > 0:
offload_str = f"{r['offload_pct']:.1f}% CPU"
else:
offload_str = "0% (GPU only)"
else:
offload_str = "Failed"
print(f"{name:<{w['m']}} {r['params']:<{w['p']}} {size_str:<{w['s']}} "
f"{r['quant']:<{w['q']}} {r['num_ctx']:<{w['ctx']}} {vram_str:>{w['v']}} {offload_str:>{w['o']}}")
# Summary
successful = sum(1 for r in results if r['success'])
with_offload = sum(1 for r in results if r['success'] and r['offload_pct'] > 0)
print("\n" + "="*110)
print(f"Tested: {len(results)} | Successful: {successful} | CPU Offloading: {with_offload}")
if with_offload > 0:
print(f"\n{with_offload} model(s) using CPU offloading - consider reducing num_ctx or using smaller quantization")
if __name__ == '__main__':
main()