#!/usr/bin/env python3
"""
Ollama-Utils Web Interface
A comprehensive web interface for managing Ollama models and monitoring system resources.
"""
import json
import os
import re
import subprocess
import sys
import urllib.request
import urllib.error
import threading
import time
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from flask import Flask, render_template, jsonify, request, send_from_directory
from urllib.parse import urlparse
# Import utilities from existing scripts
sys.path.append(os.path.join(os.path.dirname(__file__), 'scripts'))
# Import existing CLI tools
import importlib.util
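# The helper scripts use hyphens in their filenames (e.g. vram-test.py), which are not
# valid Python module names, so they are loaded explicitly via importlib.util rather
# than with a normal import statement.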
# Load vram-test module
vram_test_spec = importlib.util.spec_from_file_location("vram_test", os.path.join(os.path.dirname(__file__), 'scripts', 'vram-test.py'))
vram_test_module = importlib.util.module_from_spec(vram_test_spec)
vram_test_spec.loader.exec_module(vram_test_module)
# Load context-optimizer module
context_optimizer_spec = importlib.util.spec_from_file_location("context_optimizer", os.path.join(os.path.dirname(__file__), 'scripts', 'context-optimizer.py'))
context_optimizer_module = importlib.util.module_from_spec(context_optimizer_spec)
context_optimizer_spec.loader.exec_module(context_optimizer_module)
# Load model-info module
model_info_spec = importlib.util.spec_from_file_location("model_info", os.path.join(os.path.dirname(__file__), 'scripts', 'model-info.py'))
model_info_module = importlib.util.module_from_spec(model_info_spec)
model_info_spec.loader.exec_module(model_info_module)
# Load hf-llm-install module
hf_install_spec = importlib.util.spec_from_file_location("hf_install", os.path.join(os.path.dirname(__file__), 'scripts', 'hf-llm-install.py'))
hf_install_module = importlib.util.module_from_spec(hf_install_spec)
hf_install_spec.loader.exec_module(hf_install_module)
app = Flask(__name__)
app.config['MODELFILE_REPO'] = os.path.join(os.path.dirname(__file__), 'modelfile-repo')
# Global state for background installations
install_jobs = {}
install_lock = threading.Lock()
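# install_jobs is an in-memory registry of install/recreate jobs keyed by job_id and
# guarded by install_lock, since jobs run in daemon threads alongside Flask requests.
# State is not persisted; restarting the app forgets any queued or running jobs.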
# ===== UTILITY FUNCTIONS =====
def get_ollama_url():
    """Get the Ollama API URL."""
    return "http://localhost:11434"
def get_gpu_metrics() -> Tuple[Optional[float], Optional[float], Optional[int]]:
    """Get GPU VRAM and load metrics."""
    try:
        # sysfs attributes exposed by the amdgpu driver; card numbering varies, so try both
        device_paths = [
            "/sys/class/drm/card1/device/",
            "/sys/class/drm/card0/device/",
        ]
        for base_path in device_paths:
            if not os.path.exists(base_path):
                continue
            try:
                # Values are reported in bytes; convert to MiB
                with open(base_path + "mem_info_vram_used", "r") as f:
                    used = int(f.read().strip()) / 1024 / 1024
                with open(base_path + "mem_info_vram_total", "r") as f:
                    total = int(f.read().strip()) / 1024 / 1024
                with open(base_path + "gpu_busy_percent", "r") as f:
                    load = int(f.read().strip())
                # Sanity check: ignore a 99% "busy" reading when almost no VRAM is in use
                if load == 99 and used < (total * 0.1):
                    load = 0
                return used, total, load
            except (OSError, ValueError):
                continue
        return None, None, None
    except Exception:
        return None, None, None
def get_sys_metrics() -> Tuple[float, int, int]:
    """Get system CPU and RAM metrics."""
    try:
        load_avg = os.getloadavg()[0]
        # Parse the "Mem:" row of `free -m`: column 1 is total, column 2 is used (MiB)
        mem_output = subprocess.check_output("free -m", shell=True).decode().split('\n')[1].split()
        ram_used = int(mem_output[2])
        ram_total = int(mem_output[1])
        return load_avg, ram_used, ram_total
    except Exception:
        return 0.0, 0, 0
def get_model_info_detailed(model_name: str) -> Dict:
    """Get detailed model information from 'ollama show'. Uses model-info.py logic."""
    # Get the on-disk size from 'ollama list' first
    disk_size = 'N/A'
    try:
        result = subprocess.run(['ollama', 'list'], capture_output=True, text=True, check=True)
        for line in result.stdout.strip().split('\n')[1:]:
            if line.strip():
                parts = line.split()
                if len(parts) >= 3 and parts[0] == model_name:
                    disk_size = parts[2]
                    break
    except Exception:
        disk_size = 'N/A'
    # Use the existing get_model_info function from model-info.py
    info = model_info_module.get_model_info(model_name, disk_size)
    # Convert to expected format (model-info uses slightly different keys)
    return {
        'name': model_name,
        'family': info.get('family', 'N/A'),
        'params': info.get('params_str', 'N/A'),
        'quant': info.get('quant', 'N/A'),
        'max_context': info.get('context', 0),
        'context_used': info.get('context_used', 0),
        'capabilities': list(info.get('caps', [])),
        'license': 'N/A',
        'system_prompt': '',
        'vram_estimate': info.get('vram', 'N/A')
    }
def check_modelfile_exists(model_name: str) -> Optional[str]:
    """Check if a Modelfile exists for this model in the modelfile-repo directory."""
    modelfile_dir = app.config['MODELFILE_REPO']
    if not os.path.exists(modelfile_dir):
        return None
    # Try exact match first
    modelfile_path = os.path.join(modelfile_dir, f"{model_name}.Modelfile")
    if os.path.exists(modelfile_path):
        return modelfile_path
    # Try with colons replaced by dashes (ministral-3:3b -> ministral-3-3b)
    normalized_name = model_name.replace(':', '-')
    modelfile_path = os.path.join(modelfile_dir, f"{normalized_name}.Modelfile")
    if os.path.exists(modelfile_path):
        return modelfile_path
    return None
def parse_modelfile_metadata(modelfile_path: str) -> Optional[Dict]:
    """Parse metadata from a Modelfile using hf-llm-install.py logic."""
    try:
        # Use the existing parse_modelfile function from hf-llm-install.py
        model_info = hf_install_module.parse_modelfile(modelfile_path)
        if not model_info:
            return None
        # Extract quantization and other params from the modelfile content
        quantization = None
        num_ctx = None
        family = None
        params = None
        with open(modelfile_path, 'r') as f:
            content = f.read()
        # Extract quantization
        quant_match = re.search(r'#\s*quantization:\s*([a-zA-Z0-9_]+)', content)
        if quant_match:
            quantization = quant_match.group(1).upper()
        else:
            # Fall back to extracting it from the GGUF filename
            gguf_filename = model_info.get('gguf_filename', '')
            quant_pattern = re.search(r'[_-](Q[0-9]+_[KLM]+(?:_[LSM])?)\.gguf', gguf_filename, re.IGNORECASE)
            if quant_pattern:
                quantization = quant_pattern.group(1).upper()
        # Extract num_ctx
        ctx_match = re.search(r'PARAMETER\s+num_ctx\s+(\d+)', content)
        if ctx_match:
            num_ctx = int(ctx_match.group(1))
        # Extract params and family from model name
        model_name = model_info['model_name']
        # Pattern: modelbase:Xb-variant (e.g., "ministral-3:3b-instruct-2512-q5_k_m")
        params_match = re.search(r':(\d+)b', model_name, re.IGNORECASE)
        if params_match:
            params = params_match.group(1) + 'B'
        # Extract family from base name
        if ':' in model_name:
            family = model_name.split(':')[0].upper()
        # Get capabilities from model_info (parsed by hf_install_module)
        capabilities = model_info.get('capabilities', [])
        # Convert to expected format
        return {
            'path': modelfile_path,
            'filename': os.path.basename(modelfile_path),
            'model_name': model_info['model_name'],
            'hf_upstream': model_info.get('hf_url'),
            'quantization': quantization or 'unspecified',
            'sha256': model_info.get('sha256'),
            'num_ctx': num_ctx or 0,
            'family': family or 'Unknown',
            'params': params or 'Unknown',
            'capabilities': capabilities or []
        }
    except Exception:
        return None
def get_all_modelfiles() -> List[Dict]:
    """Get all modelfiles from the modelfile-repo directory."""
    modelfile_dir = app.config['MODELFILE_REPO']
    if not os.path.exists(modelfile_dir):
        return []
    modelfiles = []
    for filename in os.listdir(modelfile_dir):
        if filename.endswith('.Modelfile'):
            filepath = os.path.join(modelfile_dir, filename)
            metadata = parse_modelfile_metadata(filepath)
            if metadata:
                modelfiles.append(metadata)
    return modelfiles
def run_install_job(job_id: str, modelfile_path: str):
    """Run installation in background thread."""
    with install_lock:
        install_jobs[job_id]['status'] = 'running'
        install_jobs[job_id]['progress'] = 'Starting installation...'

    # Progress callback
    def update_progress(message):
        with install_lock:
            install_jobs[job_id]['progress'] = message

    # Cancellation callback
    def should_cancel():
        with install_lock:
            return install_jobs[job_id].get('cancelled', False)

    try:
        success, skipped, model_name = hf_install_module.install_model(
            modelfile_path,
            dry_run=False,
            skip_existing=False,
            existing_models=None,
            should_cancel=should_cancel,
            progress_callback=update_progress
        )
        with install_lock:
            if success:
                install_jobs[job_id]['status'] = 'completed'
                install_jobs[job_id]['model_name'] = model_name
                install_jobs[job_id]['progress'] = f'Successfully installed {model_name}'
            else:
                install_jobs[job_id]['status'] = 'failed'
                install_jobs[job_id]['error'] = f'Installation failed for {model_name}'
    except InterruptedError as e:
        with install_lock:
            install_jobs[job_id]['status'] = 'cancelled'
            install_jobs[job_id]['error'] = str(e)
    except Exception as e:
        with install_lock:
            # Check if it was actually cancelled before marking as failed
            if install_jobs[job_id].get('cancelled', False):
                install_jobs[job_id]['status'] = 'cancelled'
                install_jobs[job_id]['error'] = 'Installation cancelled by user'
            else:
                install_jobs[job_id]['status'] = 'failed'
                install_jobs[job_id]['error'] = str(e)
def run_huggingface_install_job(job_id: str, model_name: str, modelfile_content: str, file_url: str, gguf_filename: str):
    """Run HuggingFace model installation in background thread."""
    with install_lock:
        install_jobs[job_id]['status'] = 'running'
        install_jobs[job_id]['progress'] = 'Starting download...'

    # Progress callback
    def update_progress(message):
        with install_lock:
            install_jobs[job_id]['progress'] = message

    # Cancellation callback
    def should_cancel():
        with install_lock:
            return install_jobs[job_id].get('cancelled', False)

    temp_gguf = None
    temp_modelfile = None
    try:
        # Create temp files
        import tempfile
        temp_gguf = tempfile.NamedTemporaryFile(suffix='.gguf', delete=False)
        temp_gguf.close()
        gguf_path = temp_gguf.name
        temp_modelfile = tempfile.NamedTemporaryFile(mode='w', suffix='.Modelfile', delete=False)
        temp_modelfile.write(modelfile_content)
        temp_modelfile.close()
        modelfile_path = temp_modelfile.name
        # Use existing download_file function with callbacks
        hf_install_module.download_file(file_url, gguf_path, gguf_filename, should_cancel, update_progress)
        # Use existing create_ollama_model function
        hf_install_module.create_ollama_model(modelfile_path, gguf_path, model_name)
        # Save Modelfile to repo
        normalized_name = model_name.replace(':', '-')
        final_modelfile_path = os.path.join(app.config['MODELFILE_REPO'], f"{normalized_name}.Modelfile")
        os.makedirs(os.path.dirname(final_modelfile_path), exist_ok=True)
        with open(final_modelfile_path, 'w') as f:
            f.write(modelfile_content)
        with install_lock:
            install_jobs[job_id]['status'] = 'completed'
            install_jobs[job_id]['model_name'] = model_name
            install_jobs[job_id]['progress'] = f'Successfully created {model_name}'
    except InterruptedError:
        with install_lock:
            install_jobs[job_id]['status'] = 'cancelled'
            install_jobs[job_id]['error'] = 'Installation cancelled by user'
    except Exception as e:
        with install_lock:
            if install_jobs[job_id].get('cancelled', False):
                install_jobs[job_id]['status'] = 'cancelled'
                install_jobs[job_id]['error'] = 'Installation cancelled by user'
            else:
                install_jobs[job_id]['status'] = 'failed'
                install_jobs[job_id]['error'] = str(e)
    finally:
        # Clean up temp files
        if temp_gguf and os.path.exists(temp_gguf.name):
            os.unlink(temp_gguf.name)
        if temp_modelfile and os.path.exists(temp_modelfile.name):
            os.unlink(temp_modelfile.name)
# ===== WEB ROUTES =====
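# All routes below return JSON. Long-running work (installs and recreations) is handed
# off to background threads registered in install_jobs; clients can poll
# /api/install/status/<job_id> or /api/install/active for progress.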
@app.route('/')
def index():
    """Main page."""
    return render_template('index.html')
@app.route('/api/status')
def api_status():
    """Get real-time system status and running models."""
    # Get system metrics
    cpu_load, ram_used, ram_total = get_sys_metrics()
    vram_used, vram_total, gpu_load = get_gpu_metrics()
    # Get running models from /api/ps
    running_models = []
    try:
        url = f"{get_ollama_url()}/api/ps"
        with urllib.request.urlopen(url, timeout=2) as response:
            ps_data = json.loads(response.read().decode())
        for model in ps_data.get('models', []):
            size_vram = model.get('size_vram', 0) / (1024**3)  # GB
            size_total = model.get('size', 0) / (1024**3)  # GB
            offload_pct = ((size_total - size_vram) / size_total * 100) if size_total > 0 else 0
            running_models.append({
                'name': model.get('name', 'Unknown'),
                'size_gb': size_total,
                'vram_gb': size_vram,
                'offload_pct': offload_pct,
                'expires_at': model.get('expires_at', '')
            })
    except Exception as e:
        print(f"Error getting running models: {e}")
    return jsonify({
        'cpu_load': round(cpu_load, 2),
        'ram_used_mb': ram_used,
        'ram_total_mb': ram_total,
        'ram_used_pct': round((ram_used / ram_total * 100) if ram_total > 0 else 0, 1),
        'vram_used_mb': round(vram_used) if vram_used is not None else None,
        'vram_total_mb': round(vram_total) if vram_total is not None else None,
        'vram_used_pct': round((vram_used / vram_total * 100) if vram_total and vram_total > 0 else 0, 1),
        'gpu_load': gpu_load,
        'running_models': running_models
    })
@app.route('/api/models')
def api_models():
    """Get list of all installed models and available modelfiles."""
    try:
        result = subprocess.run(['ollama', 'list'], capture_output=True, text=True, check=True)
        installed_models = []
        installed_names = set()
        for line in result.stdout.strip().split('\n')[1:]:  # Skip header
            if line.strip():
                parts = line.split()
                if len(parts) >= 3:
                    name = parts[0]
                    installed_names.add(name)
                    model_id = parts[1] if len(parts) > 1 else ''
                    size = parts[2] if len(parts) > 2 else 'N/A'
                    # Check if modelfile exists
                    modelfile_path = check_modelfile_exists(name)
                    has_modelfile = modelfile_path is not None
                    # Get detailed info
                    detailed_info = get_model_info_detailed(name)
                    installed_models.append({
                        'name': name,
                        'id': model_id,
                        'size': size,
                        'installed': True,
                        'has_modelfile': has_modelfile,
                        'modelfile_path': modelfile_path,
                        'family': detailed_info['family'],
                        'params': detailed_info['params'],
                        'quant': detailed_info['quant'],
                        'max_context': detailed_info['max_context'],
                        'context_used': detailed_info['context_used'],
                        'capabilities': detailed_info['capabilities'],
                        'vram_estimate': detailed_info['vram_estimate']
                    })
        # Get all modelfiles
        all_modelfiles = get_all_modelfiles()
        available_modelfiles = []
        for mf in all_modelfiles:
            # Check if this modelfile's model is already installed
            if mf['model_name'] not in installed_names:
                available_modelfiles.append({
                    'name': mf['model_name'],
                    'installed': False,
                    'has_modelfile': True,
                    'modelfile_path': mf['path'],
                    'hf_upstream': mf['hf_upstream'],
                    'quantization': mf['quantization'],
                    'family': mf.get('family', 'Unknown'),
                    'params': mf.get('params', 'Unknown'),
                    'quant': mf['quantization'],
                    'max_context': mf.get('num_ctx', 0),
                    'context_used': 0,
                    'capabilities': mf.get('capabilities', []),
                    'vram_estimate': 'N/A',
                    'size': 'Not installed'
                })
        # Combine installed and available
        all_models = installed_models + available_modelfiles
        return jsonify({
            'models': all_models,
            'installed_count': len(installed_models),
            'available_count': len(available_modelfiles)
        })
    except subprocess.CalledProcessError as e:
        return jsonify({'error': str(e)}), 500
@app.route('/api/model/<path:model_name>')
def api_model_detail(model_name):
    """Get detailed information about a specific model."""
    info = get_model_info_detailed(model_name)
    modelfile_path = check_modelfile_exists(model_name)
    return jsonify({
        'info': info,
        'has_modelfile': modelfile_path is not None,
        'modelfile_path': modelfile_path
    })
@app.route('/api/modelfile/<path:model_name>')
def api_get_modelfile(model_name):
    """Get the Modelfile content for a model."""
    modelfile_path = check_modelfile_exists(model_name)
    if not modelfile_path or not os.path.exists(modelfile_path):
        return jsonify({'error': 'Modelfile not found'}), 404
    try:
        with open(modelfile_path, 'r') as f:
            content = f.read()
        return jsonify({
            'path': modelfile_path,
            'content': content
        })
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@app.route('/api/modelfile/<path:model_name>', methods=['POST'])
def api_save_modelfile(model_name):
    """Save Modelfile content and optionally recreate the model."""
    data = request.get_json()
    content = data.get('content', '')
    recreate_model = data.get('recreate_model', False)
    if not content:
        return jsonify({'error': 'No content provided'}), 400
    # Determine the modelfile path
    modelfile_path = check_modelfile_exists(model_name)
    if not modelfile_path:
        # Create new Modelfile
        normalized_name = model_name.replace(':', '-')
        modelfile_path = os.path.join(app.config['MODELFILE_REPO'], f"{normalized_name}.Modelfile")
    try:
        # Ensure directory exists
        os.makedirs(os.path.dirname(modelfile_path), exist_ok=True)
        # Save the modelfile
        with open(modelfile_path, 'w') as f:
            f.write(content)
        # If requested, start a background job to recreate the model
        if recreate_model:
            # Create job ID
            job_id = f"recreate_{int(time.time() * 1000)}"
            # Initialize job state
            with install_lock:
                install_jobs[job_id] = {
                    'status': 'queued',
                    'progress': 'Queued for recreation',
                    'modelfile_path': modelfile_path,
                    'model_name': model_name,
                    'error': None,
                    'cancelled': False
                }
            # Start background thread
            thread = threading.Thread(target=run_install_job, args=(job_id, modelfile_path))
            thread.daemon = True
            thread.start()
            return jsonify({
                'success': True,
                'path': modelfile_path,
                'job_id': job_id,
                'recreating': True
            })
        return jsonify({
            'success': True,
            'path': modelfile_path,
            'recreating': False
        })
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@app.route('/api/model/<path:model_name>', methods=['DELETE'])
def api_delete_model(model_name):
    """Delete a model."""
    try:
        subprocess.run(['ollama', 'rm', model_name], check=True, capture_output=True)
        return jsonify({'success': True})
    except subprocess.CalledProcessError as e:
        return jsonify({'error': str(e)}), 500
@app.route('/api/install/ollama', methods=['POST'])
def api_install_ollama_model():
    """Install a model from the Ollama library."""
    data = request.get_json()
    model_name = data.get('model_name', '')
    if not model_name:
        return jsonify({'error': 'No model name provided'}), 400
    try:
        # Run 'ollama pull' and wait for it to finish
        process = subprocess.Popen(
            ['ollama', 'pull', model_name],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True
        )
        # Read output (this blocks the request until the pull completes)
        stdout, stderr = process.communicate()
        if process.returncode == 0:
            return jsonify({
                'success': True,
                'message': f'Successfully pulled {model_name}',
                'output': stdout
            })
        else:
            return jsonify({
                'error': f'Failed to pull model: {stderr}'
            }), 500
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@app.route('/api/install/huggingface', methods=['POST'])
def api_install_huggingface():
    """Process a HuggingFace URL and return a Modelfile skeleton or a list of GGUF files."""
    data = request.get_json()
    hf_url = data.get('url', '')
    selected_file = data.get('selected_file', None)  # Set when the user picks a file from the dropdown
    if not hf_url:
        return jsonify({'error': 'No URL provided'}), 400
    try:
        # Parse the URL
        parsed = urlparse(hf_url)
        path_parts = parsed.path.strip('/').split('/')
        if len(path_parts) < 2:
            return jsonify({'error': 'Invalid HuggingFace URL'}), 400
        org = path_parts[0]
        repo = path_parts[1]
        # Check if it's a direct GGUF file link
        if hf_url.endswith('.gguf') or '/blob/' in hf_url or '/resolve/' in hf_url:
            # Direct GGUF file URL
            gguf_filename = os.path.basename(parsed.path)
            file_url = hf_url.replace('/blob/', '/resolve/')
            return generate_modelfile_response(org, repo, gguf_filename, file_url)
        elif selected_file:
            # User selected a file from the dropdown
            file_url = f"https://huggingface.co/{org}/{repo}/resolve/main/{selected_file}"
            return generate_modelfile_response(org, repo, selected_file, file_url)
        else:
            # Repository root - fetch available GGUF files
            api_url = f"https://huggingface.co/api/models/{org}/{repo}"
            with urllib.request.urlopen(api_url, timeout=10) as response:
                model_data = json.loads(response.read().decode())
            # Extract GGUF files from siblings
            gguf_files = []
            for sibling in model_data.get('siblings', []):
                filename = sibling.get('rfilename', '')
                if filename.lower().endswith('.gguf'):
                    size_bytes = sibling.get('size', 0)
                    size_gb = size_bytes / (1024**3) if size_bytes else 0
                    gguf_files.append({
                        'filename': filename,
                        'size': f"{size_gb:.2f} GB" if size_gb > 0 else "Unknown size"
                    })
            if not gguf_files:
                return jsonify({
                    'error': f'No GGUF files found in repository {org}/{repo}'
                }), 404
            # Return the list of files for the user to choose from
            return jsonify({
                'success': True,
                'requires_selection': True,
                'org': org,
                'repo': repo,
                'repo_url': f"https://huggingface.co/{org}/{repo}",
                'gguf_files': gguf_files
            })
    except Exception as e:
        return jsonify({'error': str(e)}), 500
def generate_modelfile_response(org: str, repo: str, gguf_filename: str, file_url: str):
    """Generate a Modelfile from a GGUF filename using the same logic as hf-llm-install.py."""
    try:
        # Use shared parsing function from hf-llm-install.py
        model_base, tag, full_name = hf_install_module.parse_model_name_from_gguf(gguf_filename)
        # Extract quantization for metadata
        quant_match = re.search(r'[._-](Q[0-9]+_[KLM0-9]+(?:_[LSM])?)', gguf_filename, re.IGNORECASE)
        quantization = quant_match.group(1).upper() if quant_match else 'unspecified'
        # Create Modelfile skeleton with relative path (like the CLI does)
        modelfile_content = f"""# Modelfile for {full_name}
# hf_upstream: {file_url}
# quantization: {quantization}
# capabilities: tools
# sha256: <add_sha256_checksum_here>

FROM ./{gguf_filename}

# System prompt - customize for your use case
SYSTEM \"\"\"You are a helpful AI assistant.\"\"\"

# Parameters - refer to manufacturer's recommendations
# https://huggingface.co/{org}/{repo}
PARAMETER temperature 0.7
PARAMETER top_p 0.9
PARAMETER top_k 40
PARAMETER num_ctx 8192
PARAMETER repeat_penalty 1.1
PARAMETER stop "<|im_end|>"
PARAMETER stop "<|end|>"
PARAMETER stop "</s>"

# Template - adjust based on model's chat template
TEMPLATE \"\"\"{{{{ if .System }}}}<|im_start|>system
{{{{ .System }}}}<|im_end|>
{{{{ end }}}}{{{{ if .Prompt }}}}<|im_start|>user
{{{{ .Prompt }}}}<|im_end|>
{{{{ end }}}}<|im_start|>assistant
{{{{ .Response }}}}<|im_end|>
\"\"\"
"""
        return jsonify({
            'success': True,
            'requires_selection': False,
            'model_name': model_base,
            'tag': tag,
            'full_name': full_name,
            'gguf_filename': gguf_filename,
            'file_url': file_url,
            'repo_url': f"https://huggingface.co/{org}/{repo}",
            'modelfile_content': modelfile_content
        })
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@app.route('/api/install/huggingface/create', methods=['POST'])
def api_create_from_modelfile():
    """Start HuggingFace model creation as a background job."""
    data = request.get_json()
    model_name = data.get('model_name', '').strip()
    modelfile_content = data.get('modelfile_content', '')
    file_url = data.get('file_url', '')
    gguf_filename = data.get('gguf_filename', '')
    if not model_name or not modelfile_content or not file_url:
        return jsonify({'error': 'Missing required parameters'}), 400
    try:
        # Create job ID
        job_id = f"hf_install_{int(time.time() * 1000)}"
        # Initialize job state
        with install_lock:
            install_jobs[job_id] = {
                'status': 'queued',
                'progress': 'Queued for download',
                'model_name': model_name,
                'error': None,
                'cancelled': False
            }
        # Start background thread
        thread = threading.Thread(
            target=run_huggingface_install_job,
            args=(job_id, model_name, modelfile_content, file_url, gguf_filename)
        )
        thread.daemon = True
        thread.start()
        return jsonify({
            'success': True,
            'job_id': job_id,
            'message': 'Installation started'
        })
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@app.route('/api/install/modelfile', methods=['POST'])
def api_install_from_modelfile():
    """Start installation of a model from an existing Modelfile as a background job."""
    try:
        data = request.get_json()
        modelfile_path = data.get('modelfile_path', '')
        if not modelfile_path:
            return jsonify({'error': 'No modelfile path provided'}), 400
        if not os.path.exists(modelfile_path):
            return jsonify({'error': 'Modelfile not found'}), 404
        # Create job ID
        job_id = f"install_{int(time.time() * 1000)}"
        # Initialize job state
        with install_lock:
            install_jobs[job_id] = {
                'status': 'queued',
                'progress': 'Queued for installation',
                'modelfile_path': modelfile_path,
                'model_name': None,
                'error': None,
                'cancelled': False
            }
        # Start background thread
        thread = threading.Thread(target=run_install_job, args=(job_id, modelfile_path))
        thread.daemon = True
        thread.start()
        return jsonify({
            'success': True,
            'job_id': job_id,
            'message': 'Installation started'
        })
    except Exception as e:
        return jsonify({'error': str(e)}), 500
@app.route('/api/install/status/<job_id>', methods=['GET'])
def api_install_status(job_id):
    """Get status of an installation job."""
    with install_lock:
        if job_id not in install_jobs:
            return jsonify({'error': 'Job not found'}), 404
        job = install_jobs[job_id].copy()
    return jsonify({
        'status': job['status'],
        'progress': job['progress'],
        'model_name': job.get('model_name'),
        'error': job.get('error')
    })
@app.route('/api/install/active', methods=['GET'])
def api_install_active():
    """Get all active (running or queued) installation jobs."""
    with install_lock:
        active = {}
        for job_id, job in install_jobs.items():
            if job['status'] in ['queued', 'running']:
                active[job_id] = {
                    'status': job['status'],
                    # Not every job type records a modelfile_path, so use .get()
                    'modelfile_path': job.get('modelfile_path'),
                    'model_name': job.get('model_name')
                }
    return jsonify(active)
@app.route('/api/install/cancel/<job_id>', methods=['POST'])
def api_install_cancel(job_id):
    """Cancel an installation job."""
    with install_lock:
        if job_id not in install_jobs:
            return jsonify({'error': 'Job not found'}), 404
        if install_jobs[job_id]['status'] in ['completed', 'failed', 'cancelled']:
            return jsonify({'error': 'Job already finished'}), 400
        install_jobs[job_id]['cancelled'] = True
    return jsonify({'success': True})
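# Note: cancellation is cooperative. The endpoint above only sets the 'cancelled' flag;
# the worker thread observes it through its should_cancel() callback, so a job may take
# a moment to actually stop after a cancel request.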
@app.route('/api/performance/vram-test/<path:model_name>', methods=['POST'])
def api_vram_test(model_name):
    """Test VRAM usage for a specific model or all models."""
    try:
        # Check if testing all models
        if model_name == '_all_':
            # Get all installed models
            result = subprocess.run(['ollama', 'list'], capture_output=True, text=True, check=True)
            models_to_test = []
            for line in result.stdout.strip().split('\n')[1:]:
                if line.strip():
                    parts = line.split()
                    if len(parts) >= 1:
                        models_to_test.append(parts[0])
            # Test each model
            results = []
            for model in models_to_test:
                results.append(test_single_model_vram(model))
                time.sleep(0.5)  # Brief pause between tests
            return jsonify({
                'success': True,
                'results': results
            })
        else:
            # Test single model
            return jsonify(test_single_model_vram(model_name))
    except Exception as e:
        return jsonify({'error': str(e)}), 500
def test_single_model_vram(model_name: str) -> Dict:
    """Test VRAM usage for a single model. Uses vram-test.py logic."""
    # Use the existing test_model_vram function from vram-test.py
    return vram_test_module.test_model_vram(model_name)
@app.route('/api/performance/optimize/<path:model_name>', methods=['POST'])
def api_optimize_context(model_name):
    """Run the context optimizer for a specific model. Uses context-optimizer.py logic."""
    try:
        # Get parameters from request
        data = request.get_json() or {}
        overhead_gb = float(data.get('overhead_gb', 1.0))
        max_turns = int(data.get('max_turns', 20))
        # Use the existing find_optimal_context function from context-optimizer.py
        result = context_optimizer_module.find_optimal_context(model_name, max_turns=max_turns, overhead_gb=overhead_gb)
        if not result or 'results' not in result:
            return jsonify({
                'success': False,
                'error': 'Optimization failed or no results returned'
            })
        # Extract data from results
        test_results = []
        optimal_context = 0
        for r in result.get('results', []):
            test_results.append({
                'context_size': r.get('num_ctx', 0),
                'vram_gb': round(r.get('vram_gb', 0), 2),
                'offload_pct': round(r.get('offload_pct', 0), 1),
                'fits': r.get('offload_pct', 100) == 0
            })
            # Track the optimal context (largest that fits entirely in VRAM)
            if r.get('offload_pct', 100) == 0:
                optimal_context = max(optimal_context, r.get('num_ctx', 0))
        # Get VRAM info
        vram_total, vram_available = context_optimizer_module.get_gpu_vram()
        return jsonify({
            'success': True,
            'model': model_name,
            'max_context': result.get('max_context', 0),
            'current_context': result.get('current_ctx', 0),
            'optimal_context': result.get('recommended_ctx', optimal_context),
            'available_vram_gb': round(vram_available, 2) if vram_available else 0,
            'results': test_results,
            'summary': result.get('summary', '')
        })
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500
if __name__ == '__main__':
    print("Starting Ollama-Utils Web Interface...")
    print("Access the interface at: http://localhost:5000")
    # Flask's built-in server with debug=True is intended for development only;
    # binding to 0.0.0.0 exposes the interface (and the debugger) to the whole network.
    app.run(host='0.0.0.0', port=5000, debug=True)