<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>LLM Evaluation Dashboard</title>
<!--
  Chart.js draws every chart and axios performs the /api/* requests; the inline
  script at the end of <body> uses both, so they are loaded synchronously here.
  NOTE(review): both URLs are unpinned; consider pinning versions and adding
  integrity/crossorigin attributes.
-->
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<script src="https://cdn.jsdelivr.net/npm/axios/dist/axios.min.js"></script>
<style>
/* Global reset: remove default spacing and size boxes by their border edge. */
* {
  margin: 0;
  padding: 0;
  box-sizing: border-box;
}

/* Light-theme design tokens (default). */
:root {
  --bg-gradient-start: #667eea;
  --bg-gradient-end: #764ba2;
  --card-bg: #ffffff;
  --text-primary: #333333;
  --text-secondary: #666666;
  --border-color: #e0e0e0;
  --stat-card-bg: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
  --shadow: rgba(0,0,0,0.1);
  --shadow-hover: rgba(0,0,0,0.15);
}

/* Dark-theme overrides, applied while <body> carries the dark-mode class. */
body.dark-mode {
  --bg-gradient-start: #1a1a2e;
  --bg-gradient-end: #16213e;
  --card-bg: #0f1419;
  --text-primary: #e0e0e0;
  --text-secondary: #a0a0a0;
  --border-color: #2a2a3e;
  --stat-card-bg: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
  --shadow: rgba(0,0,0,0.3);
  --shadow-hover: rgba(0,0,0,0.5);
}

/* Page canvas: full-height gradient built from the theme tokens. */
body {
  font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
  background: linear-gradient(135deg, var(--bg-gradient-start) 0%, var(--bg-gradient-end) 100%);
  color: var(--text-primary);
  min-height: 100vh;
  padding: 20px;
  transition: all 0.3s ease;
}

/* Centered content column. */
.container {
  max-width: 1400px;
  margin: 0 auto;
}

/* Header card; position: relative anchors the absolutely positioned toggles. */
header {
  background: var(--card-bg);
  padding: 30px;
  border-radius: 15px;
  box-shadow: 0 10px 40px var(--shadow);
  margin-bottom: 30px;
  position: relative;
}

/* Shared look of the two pill-shaped header toggles. */
.theme-toggle,
.scale-toggle {
  position: absolute;
  top: 30px;
  background: var(--border-color);
  /* Buttons do not inherit the page text colour; without this the label
     stays dark on the dark-theme background. */
  color: var(--text-primary);
  border: none;
  padding: 10px 20px;
  border-radius: 20px;
  cursor: pointer;
  font-size: 1em;
  transition: all 0.3s;
}

.theme-toggle {
  right: 30px;
}

/* Sits to the left of the theme toggle. */
.scale-toggle {
  right: 240px;
}

.theme-toggle:hover,
.scale-toggle:hover {
  transform: scale(1.05);
  box-shadow: 0 4px 15px var(--shadow-hover);
}

/* Highlighted state while the zoomed chart scale is active. */
.scale-toggle.zoomed {
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
  color: white;
}

/* Gradient-filled title text. */
h1 {
  font-size: 2.5em;
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
  -webkit-background-clip: text;
  /* Standard property alongside the prefixed one. */
  background-clip: text;
  -webkit-text-fill-color: transparent;
  margin-bottom: 10px;
}

.subtitle {
  color: var(--text-secondary);
  font-size: 1.1em;
}

/* Tab strip above the content panels. */
.tabs {
  display: flex;
  gap: 10px;
  margin-bottom: 20px;
  flex-wrap: wrap;
}

.tab {
  background: var(--card-bg);
  border: none;
  padding: 12px 24px;
  border-radius: 8px;
  cursor: pointer;
  font-size: 1em;
  transition: all 0.3s;
  box-shadow: 0 2px 10px var(--shadow);
  color: var(--text-primary);
}

.tab:hover {
  transform: translateY(-2px);
  box-shadow: 0 4px 15px var(--shadow-hover);
}

.tab.active {
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
  color: white;
}

/* Panels are hidden unless they carry the active class. */
.content-panel {
  display: none;
  background: var(--card-bg);
  padding: 30px;
  border-radius: 15px;
  box-shadow: 0 10px 40px var(--shadow);
  animation: fadeIn 0.3s;
}

.content-panel.active {
  display: block;
}

@keyframes fadeIn {
  from { opacity: 0; transform: translateY(10px); }
  to { opacity: 1; transform: translateY(0); }
}

/* Responsive grid of summary cards. */
.stats-grid {
  display: grid;
  grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
  gap: 20px;
  margin-bottom: 30px;
}

.stat-card {
  background: var(--stat-card-bg);
  padding: 20px;
  border-radius: 10px;
  text-align: center;
}

.stat-card h3 {
  font-size: 0.9em;
  color: var(--text-secondary);
  margin-bottom: 10px;
  text-transform: uppercase;
}

.stat-card .value {
  font-size: 2.5em;
  font-weight: bold;
  color: #667eea;
}

/* Fixed-height box; the charts are created with maintainAspectRatio: false
   and therefore fill it. */
.chart-container {
  position: relative;
  height: 400px;
  margin-bottom: 30px;
}

/* Row of filter/selector controls above a chart or table. */
.controls {
  display: flex;
  gap: 15px;
  margin-bottom: 20px;
  flex-wrap: wrap;
}

select, input {
  padding: 10px 15px;
  border: 2px solid var(--border-color);
  border-radius: 8px;
  font-size: 1em;
  background: var(--card-bg);
  color: var(--text-primary);
  cursor: pointer;
  transition: border-color 0.3s;
}

/* A text field should show the text cursor, not the pointer used for pickers. */
input[type="text"] {
  cursor: text;
}

select:hover, input:hover {
  border-color: #667eea;
}

/* The native outline is removed, so a ring is drawn in its place to keep
   keyboard focus clearly visible. */
select:focus, input:focus {
  outline: none;
  border-color: #764ba2;
  box-shadow: 0 0 0 3px rgba(118, 75, 162, 0.3);
}

/* Results table. */
table {
  width: 100%;
  border-collapse: collapse;
  margin-top: 20px;
}

th, td {
  padding: 12px;
  text-align: left;
  border-bottom: 1px solid var(--border-color);
}

/* Header cells are styled as clickable. */
th {
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
  color: white;
  font-weight: 600;
  cursor: pointer;
  user-select: none;
}

th:hover {
  opacity: 0.9;
}

tr:hover {
  background: var(--border-color);
}

/* Pill badge; the modifier classes below choose the colour. */
.score-badge {
  display: inline-block;
  padding: 5px 12px;
  border-radius: 20px;
  font-weight: bold;
  font-size: 0.9em;
}

.score-exceptional {
  background: #10b981;
  color: white;
}

.score-pass {
  background: #f59e0b;
  color: white;
}

.score-fail {
  background: #ef4444;
  color: white;
}

/* Placeholder shown while data loads (also reused for status messages). */
.loading {
  text-align: center;
  padding: 40px;
  color: var(--text-secondary);
}

/* Circular spinner: a ring with one accented edge, rotated forever. */
.spinner {
  border: 3px solid var(--border-color);
  border-top: 3px solid #667eea;
  border-radius: 50%;
  width: 40px;
  height: 40px;
  animation: spin 1s linear infinite;
  margin: 20px auto;
}

@keyframes spin {
  0% { transform: rotate(0deg); }
  100% { transform: rotate(360deg); }
}

.model-selector {
  display: flex;
  gap: 10px;
  flex-wrap: wrap;
  margin-bottom: 20px;
}

.model-chip {
  padding: 8px 16px;
  border-radius: 20px;
  border: 2px solid #667eea;
  background: var(--card-bg);
  color: var(--text-primary);
  cursor: pointer;
  transition: all 0.3s;
}

.model-chip:hover {
  background: #667eea;
  color: white;
}

.model-chip.selected {
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
  color: white;
}

/* One card per model in the intelligence panel. */
.metric-card {
  background: var(--card-bg);
  border: 2px solid var(--border-color);
  border-radius: 10px;
  padding: 20px;
  margin-bottom: 20px;
}

.metric-card h3 {
  color: #667eea;
  margin-bottom: 15px;
}

/* Track of a progress bar; overflow is clipped so the fill cannot spill out. */
.progress-bar {
  background: var(--border-color);
  height: 30px;
  border-radius: 15px;
  overflow: hidden;
  margin: 10px 0;
  position: relative;
  cursor: help;
}

/* Filled part; its width is set inline and its label is right-aligned. */
.progress-fill {
  height: 100%;
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
  transition: width 0.5s;
  display: flex;
  align-items: center;
  justify-content: flex-end;
  padding-right: 10px;
  color: white;
  font-weight: bold;
}

/* Tooltip styles */
/* Host element; the bubble is positioned relative to it. */
.tooltip {
  position: relative;
  display: inline-block;
}

/* Bubble: hidden by default, 300px wide and centred above the host
   (left: 50% combined with a -150px margin). */
.tooltip .tooltiptext {
  visibility: hidden;
  width: 300px;
  background-color: rgba(0, 0, 0, 0.9);
  color: #fff;
  text-align: left;
  border-radius: 8px;
  padding: 12px;
  position: absolute;
  z-index: 1000;
  bottom: 125%;
  left: 50%;
  margin-left: -150px;
  opacity: 0;
  transition: opacity 0.3s;
  font-size: 0.85em;
  line-height: 1.4;
  box-shadow: 0 4px 20px rgba(0,0,0,0.3);
}

/* Downward-pointing arrow under the bubble, drawn with borders. */
.tooltip .tooltiptext::after {
  content: "";
  position: absolute;
  top: 100%;
  left: 50%;
  margin-left: -5px;
  border-width: 5px;
  border-style: solid;
  border-color: rgba(0, 0, 0, 0.9) transparent transparent transparent;
}

.tooltip:hover .tooltiptext {
  visibility: visible;
  opacity: 1;
}

.tooltiptext code {
  background: rgba(255, 255, 255, 0.1);
  padding: 2px 6px;
  border-radius: 3px;
  font-family: monospace;
  font-size: 0.9em;
}

.tooltiptext strong {
  color: #667eea;
}

/* Segmented control for choosing the category chart type. */
.chart-type-toggle {
  display: flex;
  gap: 5px;
  background: var(--border-color);
  border-radius: 8px;
  padding: 4px;
}

.chart-type-btn {
  padding: 8px 16px;
  border: none;
  border-radius: 6px;
  cursor: pointer;
  font-size: 0.9em;
  transition: all 0.3s;
  background: transparent;
  color: var(--text-primary);
}

.chart-type-btn:hover {
  background: rgba(102, 126, 234, 0.2);
}

.chart-type-btn.active {
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
  color: white;
}

/* Wrapping row of per-model visibility toggles for the radar chart. */
.model-toggles {
  display: flex;
  flex-wrap: wrap;
  gap: 10px;
  margin-top: 15px;
  padding: 15px;
  background: var(--stat-card-bg);
  border-radius: 10px;
}

.model-toggle-item {
  display: flex;
  align-items: center;
  gap: 8px;
  padding: 6px 12px;
  background: var(--card-bg);
  border-radius: 20px;
  cursor: pointer;
  transition: all 0.3s;
  border: 2px solid transparent;
}

.model-toggle-item:hover {
  border-color: var(--text-secondary);
}

/* currentColor picks up the per-model colour set inline on the item. */
.model-toggle-item.active {
  border-color: currentColor;
}

.model-toggle-checkbox {
  width: 18px;
  height: 18px;
  cursor: pointer;
  accent-color: #667eea;
}

.model-toggle-label {
  font-size: 0.9em;
  cursor: pointer;
  user-select: none;
}

.model-color-indicator {
  width: 12px;
  height: 12px;
  border-radius: 50%;
  flex-shrink: 0;
}

/* Radar-only controls stay hidden until the visible class is added. */
.radar-controls {
  display: none;
}

.radar-controls.visible {
  display: block;
}
</style>
</head>
<body>
<div class="container">
  <!-- Title card with the chart-scale and theme toggles pinned to its top-right corner. -->
  <header>
    <button type="button" class="scale-toggle" id="scaleToggle" onclick="toggleScale()" title="Toggle between full scale (0-5) and zoomed view for better distinction">🔍 Full Scale</button>
    <button type="button" class="theme-toggle" onclick="toggleTheme()">🌓 Toggle Dark Mode</button>
    <h1>🧠 LLM Evaluation Dashboard</h1>
    <p class="subtitle">Comprehensive Intelligence & Performance Analysis</p>
  </header>

<!-- Tab strip: each button passes itself to switchTab so the handler does not
     have to rely on the implicit global event object. -->
<div class="tabs">
  <button type="button" class="tab active" onclick="switchTab('overview', this)">📊 Overview</button>
  <button type="button" class="tab" onclick="switchTab('comparison', this)">⚔️ Model Comparison</button>
  <button type="button" class="tab" onclick="switchTab('intelligence', this)">🎯 Intelligence Metrics</button>
  <button type="button" class="tab" onclick="switchTab('categories', this)">📂 Category Analysis</button>
  <button type="button" class="tab" onclick="switchTab('details', this)">🔍 Detailed Results</button>
</div>

<!-- Overview panel: summary cards (filled by loadOverview) and the average-score bar chart. -->
<div id="overview" class="content-panel active">
  <h2>System Overview</h2>
  <div class="stats-grid" id="overviewStats">
    <div class="loading">
      <div class="spinner"></div>
      Loading data...
    </div>
  </div>
  <div class="chart-container">
    <canvas id="overviewChart" role="img" aria-label="Bar chart of the average score of each model"></canvas>
  </div>
</div>

<!-- Comparison panel: one radar chart, redrawn whenever the metric changes. -->
<div id="comparison" class="content-panel">
  <h2>Model Performance Comparison</h2>
  <div class="controls">
    <select id="metricSelect" onchange="updateComparisonChart()" aria-label="Comparison metric">
      <option value="average">Average Score</option>
      <option value="pass_rate">Pass Rate</option>
      <option value="exceptional_rate">Exceptional Rate</option>
      <option value="consistency">Consistency</option>
      <option value="robustness">Robustness</option>
    </select>
  </div>
  <div class="chart-container">
    <canvas id="comparisonChart" role="img" aria-label="Radar chart comparing models on the selected metric"></canvas>
  </div>
</div>

<!-- Intelligence panel: the metric cards are injected by loadIntelligenceMetrics. -->
<div id="intelligence" class="content-panel">
  <h2>Intelligence Metrics Analysis</h2>
  <!-- Uses the theme token (not a fixed grey) so the text stays readable in dark mode. -->
  <p style="margin-bottom: 20px; color: var(--text-secondary);">
    Advanced metrics evaluating different dimensions of AI intelligence and reasoning capabilities.
  </p>
  <div id="intelligenceMetrics">
    <div class="loading">
      <div class="spinner"></div>
      Calculating intelligence metrics...
    </div>
  </div>
</div>

<!-- Category panel: a per-category bar chart, or a radar ("spider web") chart
     spanning all categories with per-model visibility toggles. -->
<div id="categories" class="content-panel">
  <h2>Performance by Category</h2>
  <div class="controls">
    <select id="categorySelect" onchange="updateCategoryChart()" aria-label="Category">
      <option value="">Loading categories...</option>
    </select>
    <div class="chart-type-toggle">
      <button type="button" class="chart-type-btn active" onclick="setCategoryChartType('bar')" id="barChartBtn">📊 Bar Chart</button>
      <button type="button" class="chart-type-btn" onclick="setCategoryChartType('radar')" id="radarChartBtn">🕸️ Spider Web</button>
    </div>
  </div>
  <div class="radar-controls" id="radarModelToggles">
    <p style="margin-bottom: 10px; color: var(--text-secondary); font-size: 0.9em;">Toggle models to show/hide on the spider web chart:</p>
    <div class="model-toggles" id="modelToggleContainer">
      <!-- Model toggles will be populated dynamically -->
    </div>
  </div>
  <div class="chart-container">
    <canvas id="categoryChart" role="img" aria-label="Chart of model scores by category"></canvas>
  </div>
</div>

<!-- Details panel: model picker plus text/category/score filters over the results table. -->
<div id="details" class="content-panel">
  <h2>Detailed Test Results</h2>
  <div class="controls">
    <select id="modelSelect" onchange="loadModelDetails()" aria-label="Model">
      <option value="">Select a model...</option>
    </select>
    <!-- oninput also fires for paste, cut and clearing the field, which keyup misses. -->
    <input type="text" id="searchInput" placeholder="Search tests..." oninput="filterTable()" aria-label="Search tests">
    <select id="filterCategory" onchange="filterTable()" aria-label="Filter by category">
      <option value="">All Categories</option>
    </select>
    <select id="filterScore" onchange="filterTable()" aria-label="Filter by score">
      <option value="">All Scores</option>
      <option value="exceptional">Exceptional (4-5)</option>
      <option value="pass">Pass (2-3)</option>
      <option value="fail">Fail (0-1)</option>
    </select>
  </div>
  <div id="detailsTable">
    <p class="loading">Select a model to view detailed results</p>
  </div>
</div>
</div>

<script>
// Dashboard-wide state shared by the loaders and chart builders below.
let comparisonData = null;        // response body of GET /api/comparison
let statisticsData = null;        // response body of GET /api/statistics
let intelligenceData = null;      // response body of GET /api/intelligence_metrics
let currentModelDetails = null;   // NOTE(review): presumably set when a model's details load; confirm
let zoomedScale = false;          // true while charts use the zoomed axis range
let overviewChartInstance = null; // Chart.js instance drawn on #overviewChart

// Theme toggle functionality
/**
 * Switch between the light and dark themes and remember the choice.
 *
 * The preference is stored in localStorage under 'darkMode' as
 * 'enabled' or 'disabled'.
 */
function toggleTheme() {
    // classList.toggle returns true when the class is present afterwards.
    const darkNow = document.body.classList.toggle('dark-mode');
    localStorage.setItem('darkMode', darkNow ? 'enabled' : 'disabled');
}

// Load theme preference
/**
 * Re-apply the dark theme when the stored 'darkMode' preference is 'enabled'.
 * Any other stored value (or none) leaves the default light theme.
 */
function loadThemePreference() {
    if (localStorage.getItem('darkMode') === 'enabled') {
        document.body.classList.add('dark-mode');
    }
}

// Scale toggle functionality
/**
 * Flip between the full (0-5) and zoomed chart scales.
 *
 * Updates the toggle button's caption and highlight, stores the choice in
 * localStorage under 'zoomedScale', and redraws the charts.
 */
function toggleScale() {
    zoomedScale = !zoomedScale;

    const btn = document.getElementById('scaleToggle');
    btn.textContent = zoomedScale ? '🔎 Zoomed' : '🔍 Full Scale';
    btn.classList.toggle('zoomed', zoomedScale);

    localStorage.setItem('zoomedScale', zoomedScale ? 'enabled' : 'disabled');

    // Refresh all charts with new scale
    refreshAllCharts();
}

// Load scale preference
/**
 * Restore the zoomed scale when the stored 'zoomedScale' preference is
 * 'enabled', and bring the toggle button in line with it. No chart is
 * redrawn here.
 */
function loadScalePreference() {
    if (localStorage.getItem('zoomedScale') !== 'enabled') {
        return;
    }

    zoomedScale = true;
    const btn = document.getElementById('scaleToggle');
    btn.textContent = '🔎 Zoomed';
    btn.classList.add('zoomed');
}

// Calculate optimal Y-axis range for zoomed view
/**
 * Build the Chart.js `scales` option for a chart.
 *
 * @param {Array} data - Values being plotted; null, undefined and NaN entries
 *     are ignored when computing the zoomed range.
 * @param {boolean} [isRadar=false] - Configure the radial axis `r` instead of `y`.
 * @returns {Object} `{ y: {...} }` or `{ r: {...} }`. With the zoomed scale off,
 *     or when no usable value exists, the axis is fixed to 0-5. Otherwise it
 *     spans the data plus padding, clamped to the 0-5 range.
 *
 * NOTE(review): the upper bound is always capped at 5; if a metric can exceed
 * 5, the zoomed range can end up with min > max. Confirm the value ranges.
 */
function getScaleOptions(data, isRadar = false) {
    const axis = isRadar ? 'r' : 'y';
    const fullScale = () => ({ [axis]: { beginAtZero: true, max: 5 } });

    // Full scale: 0 to 5
    if (!zoomedScale) {
        return fullScale();
    }

    // Zoomed scale: calculate min/max with padding
    const validData = data.filter(d => d !== null && d !== undefined && !isNaN(d));
    if (validData.length === 0) {
        return fullScale();
    }

    const lowest = Math.min(...validData);
    const highest = Math.max(...validData);
    const padding = Math.max((highest - lowest) * 0.2, 0.2); // At least 0.2 padding

    // Pad outward and snap to one decimal place, staying inside 0-5.
    let min = Math.max(0, Math.floor((lowest - padding) * 10) / 10);
    let max = Math.min(5, Math.ceil((highest + padding) * 10) / 10);

    // Ensure we have at least some range
    if (max - min < 0.5) {
        min = Math.max(0, lowest - 0.3);
        max = Math.min(5, highest + 0.3);
    }

    return { [axis]: { min: min, max: max, beginAtZero: false } };
}

// Refresh all charts when scale changes
/**
 * Redraw the overview, comparison and category charts.
 * Does nothing until the comparison data has been loaded.
 */
function refreshAllCharts() {
    if (!comparisonData) {
        return;
    }

    refreshOverviewChart();
    // updateComparisonChart is async; report a failure instead of leaving an
    // unhandled promise rejection.
    updateComparisonChart().catch(error => {
        console.error('Error refreshing comparison chart:', error);
    });
    updateCategoryChart();
}

// Tab switching
/**
 * Show the panel whose id is `tabName` and mark its tab button as active.
 *
 * @param {string} tabName - id of the `.content-panel` element to show.
 * @param {Element} [tabButton] - The tab button to highlight. When omitted,
 *     the button is taken from the event being dispatched (if any), and
 *     otherwise from the `.tab` whose onclick attribute names `tabName`.
 *
 * Nothing changes when no panel with that id exists, so a bad name can no
 * longer leave the page with every panel hidden.
 */
function switchTab(tabName, tabButton) {
    const panel = document.getElementById(tabName);
    if (!panel) {
        console.error('Unknown tab:', tabName);
        return;
    }

    const tabs = Array.from(document.querySelectorAll('.tab'));

    // window.event is deprecated and undefined outside event dispatch, so it
    // is only a fallback for callers that do not pass the button.
    let button = tabButton || null;
    if (!button && typeof window !== 'undefined' && window.event && window.event.target
            && typeof window.event.target.closest === 'function') {
        button = window.event.target.closest('.tab');
    }
    if (!button) {
        const marker = "'" + tabName + "'";
        button = tabs.find(t => (t.getAttribute('onclick') || '').includes(marker)) || null;
    }

    tabs.forEach(t => t.classList.remove('active'));
    document.querySelectorAll('.content-panel').forEach(p => p.classList.remove('active'));

    if (button) {
        button.classList.add('active');
    }
    panel.classList.add('active');
}

// Initialize dashboard
/**
 * Bootstrap the page: restore the stored theme and scale preferences, load
 * the data behind every panel, then fill the selectors.
 *
 * The comparison data is loaded first because the other steps read it. The
 * statistics and intelligence requests are independent of each other, so
 * they run in parallel.
 */
async function initDashboard() {
    loadThemePreference();
    loadScalePreference();

    await loadOverview();
    await loadComparison();
    await Promise.all([loadStatistics(), loadIntelligenceMetrics()]);

    populateModelSelector();
}

/**
 * Fetch /api/comparison, store it in `comparisonData`, render the summary
 * cards into #overviewStats and draw the overview chart.
 *
 * With zero models the average is shown as 0.00 rather than NaN, and a model
 * without `metadata.total_tests` counts as 0 tests. If loading fails before
 * the cards are rendered, the spinner is replaced by an error message.
 */
async function loadOverview() {
    const statsContainer = document.getElementById('overviewStats');
    let statsRendered = false;

    try {
        const response = await axios.get('/api/comparison');
        comparisonData = response.data;

        const entries = Object.values(comparisonData.models);
        let totalTests = 0;
        let scoreSum = 0;
        for (const entry of entries) {
            totalTests += entry.metadata.total_tests || 0;
            scoreSum += entry.overall_stats.average || 0;
        }
        // Guard the division so an empty model list does not produce NaN.
        const avgScore = entries.length > 0 ? scoreSum / entries.length : 0;

        // Only numbers are interpolated, so the markup needs no escaping.
        const card = (title, value) => `
            <div class="stat-card">
                <h3>${title}</h3>
                <div class="value">${value}</div>
            </div>`;

        statsContainer.innerHTML = [
            card('Models Evaluated', entries.length),
            card('Total Tests', totalTests),
            card('Average Score', avgScore.toFixed(2)),
            card('Categories', comparisonData.categories.length)
        ].join('');
        statsRendered = true;

        // Create overview chart
        refreshOverviewChart();
    } catch (error) {
        console.error('Error loading overview:', error);
        if (!statsRendered) {
            statsContainer.innerHTML = '<p class="loading">Error loading overview data</p>';
        }
    }
}

/**
 * (Re)draw the overview bar chart: one bar per model showing its
 * `overall_stats.average` (missing values plot as 0). Any previous chart
 * instance is destroyed first. Does nothing before the comparison data
 * has loaded.
 */
function refreshOverviewChart() {
    if (!comparisonData) return;

    const modelNames = Object.keys(comparisonData.models);
    const averages = modelNames.map(name => comparisonData.models[name].overall_stats.average || 0);

    if (overviewChartInstance) {
        overviewChartInstance.destroy();
    }

    const context = document.getElementById('overviewChart').getContext('2d');
    const dataset = {
        label: 'Average Score',
        data: averages,
        backgroundColor: 'rgba(102, 126, 234, 0.6)',
        borderColor: 'rgba(102, 126, 234, 1)',
        borderWidth: 2
    };

    overviewChartInstance = new Chart(context, {
        type: 'bar',
        data: { labels: modelNames, datasets: [dataset] },
        options: {
            responsive: true,
            maintainAspectRatio: false,
            scales: getScaleOptions(averages)
        }
    });
}

/**
 * Draw the comparison chart for the currently selected metric.
 *
 * The chart update is awaited so callers that await this function see it
 * finish, and any failure surfaces to them.
 */
async function loadComparison() {
    await updateComparisonChart();
}

/**
 * (Re)draw the radar chart comparing all models on the metric chosen in
 * #metricSelect.
 *
 * 'consistency' and 'robustness' come from `statisticsData` (loaded on demand)
 * and are matched to models by name through `statisticsData.models`; a model
 * without a score is plotted as a gap. Every other metric is read from each
 * model's `overall_stats`, defaulting to 0.
 */
async function updateComparisonChart() {
    if (!comparisonData) return;

    const metricSelect = document.getElementById('metricSelect');
    const metric = metricSelect.value;
    const models = Object.keys(comparisonData.models);

    let data, label;

    if (metric === 'consistency' || metric === 'robustness') {
        if (!statisticsData) {
            await loadStatistics();
        }
        if (!statisticsData) {
            // loadStatistics logs its own error; keep the current chart.
            console.error('Statistics unavailable; cannot chart metric:', metric);
            return;
        }
        if (metricSelect.value !== metric) {
            // The selection changed while waiting; a newer call draws it.
            return;
        }

        const statModels = statisticsData.models || [];
        const scores = statisticsData[metric + '_score'] || [];
        // Look scores up by model name instead of assuming both lists share an order.
        data = models.map(m => {
            const position = statModels.indexOf(m);
            const score = position === -1 ? undefined : scores[position];
            return score === undefined ? null : score;
        });
        label = metric.charAt(0).toUpperCase() + metric.slice(1) + ' Score';
    } else {
        data = models.map(m => comparisonData.models[m].overall_stats[metric] || 0);
        label = metric.split('_').map(w => w.charAt(0).toUpperCase() + w.slice(1)).join(' ');
    }

    const ctx = document.getElementById('comparisonChart');
    if (window.comparisonChartInstance) {
        window.comparisonChartInstance.destroy();
    }

    window.comparisonChartInstance = new Chart(ctx, {
        type: 'radar',
        data: {
            labels: models,
            datasets: [{
                label: label,
                data: data,
                backgroundColor: 'rgba(118, 75, 162, 0.2)',
                borderColor: 'rgba(118, 75, 162, 1)',
                pointBackgroundColor: 'rgba(118, 75, 162, 1)',
                pointBorderColor: '#fff',
                pointHoverBackgroundColor: '#fff',
                pointHoverBorderColor: 'rgba(118, 75, 162, 1)'
            }]
        },
        options: {
            responsive: true,
            maintainAspectRatio: false,
            scales: getScaleOptions(data, true)
        }
    });
}

/**
 * Fetch /api/statistics into `statisticsData`.
 * A failure is logged and leaves `statisticsData` unchanged.
 */
async function loadStatistics() {
    try {
        const { data } = await axios.get('/api/statistics');
        statisticsData = data;
    } catch (error) {
        console.error('Error loading statistics:', error);
    }
}

/**
 * Fetch /api/intelligence_metrics into `intelligenceData` and render one
 * metric card per model into #intelligenceMetrics.
 *
 * Each card shows the overall intelligence score, the IQ / adaptability /
 * problem-solving bars with explanatory tooltips, and one bar per cognitive
 * dimension. Text coming from the API (model names, dimension names, counts)
 * is HTML-escaped before it is placed in the markup. On any failure the panel
 * shows an error message instead.
 */
async function loadIntelligenceMetrics() {
    // Weight of each dimension as displayed in the IQ tooltips; dimensions
    // not listed here are shown with weight 1.0.
    const dimensionWeights = {
        'logical_reasoning': 1.5,
        'mathematical_ability': 1.3,
        'technical_knowledge': 1.4,
        'instruction_following': 1.2,
        'linguistic_nuance': 1.1,
        'creativity': 1.0,
        'conversational_depth': 1.0
    };
    const weightOf = (dim) => dimensionWeights[dim] || 1.0;

    // Escape text before splicing it into an innerHTML string.
    const escapeHtml = (value) => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');

    try {
        const response = await axios.get('/api/intelligence_metrics');
        intelligenceData = response.data;

        let html = '';

        for (const [model, metrics] of Object.entries(intelligenceData)) {
            const dimensions = Object.entries(metrics.dimensions);

            // One "• name: score × weight" line per dimension for the IQ tooltip.
            const weightLines = dimensions
                .map(([dim, data]) =>
                    `• ${escapeHtml(dim.replace(/_/g, ' '))}: ${data.score.toFixed(1)} × ${weightOf(dim)}`)
                .join('<br>');

            html += `
                <div class="metric-card">
                    <h3>${escapeHtml(model)}</h3>

                    <div style="margin-bottom: 20px;" class="tooltip">
                        <strong>Overall Intelligence Score:</strong>
                        <span class="tooltiptext">
                            <strong>Calculation:</strong><br>
                            Overall = (IQ × 0.5) + (Adaptability × 0.3) + (Problem-Solving × 0.2)<br><br>
                            <strong>Values:</strong><br>
                            • IQ: ${metrics.iq_score.toFixed(1)}<br>
                            • Adaptability: ${metrics.adaptability.toFixed(1)}%<br>
                            • Problem-Solving: ${metrics.problem_solving_depth.toFixed(1)}<br><br>
                            Result: ${metrics.overall_intelligence.toFixed(1)}
                        </span>
                        <div class="progress-bar">
                            <div class="progress-fill" style="width: ${metrics.overall_intelligence}%">
                                ${metrics.overall_intelligence.toFixed(1)}
                            </div>
                        </div>
                    </div>

                    <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 15px;">
                        <div class="tooltip">
                            <strong>IQ Score:</strong>
                            <span class="tooltiptext">
                                <strong>Weighted Average of Dimensions:</strong><br><br>
                                ${weightLines}<br><br>
                                Normalized to 0-100 scale
                            </span>
                            <div class="progress-bar">
                                <div class="progress-fill" style="width: ${metrics.iq_score}%">
                                    ${metrics.iq_score.toFixed(1)}
                                </div>
                            </div>
                        </div>

                        <div class="tooltip">
                            <strong>Adaptability:</strong>
                            <span class="tooltiptext">
                                <strong>Cross-Category Performance:</strong><br><br>
                                Measures versatility across different task types.<br><br>
                                Formula: (Categories with avg ≥ 2.5) / (Total categories) × 100<br><br>
                                Higher score = more versatile model
                            </span>
                            <div class="progress-bar">
                                <div class="progress-fill" style="width: ${metrics.adaptability}%">
                                    ${metrics.adaptability.toFixed(1)}%
                                </div>
                            </div>
                        </div>

                        <div class="tooltip">
                            <strong>Problem-Solving Depth:</strong>
                            <span class="tooltiptext">
                                <strong>Performance on Challenging Tasks:</strong><br><br>
                                Average score on "hard" and "very_hard" difficulty tests.<br><br>
                                Formula: (Avg score on hard tests) × 20<br><br>
                                Tests critical thinking and complex reasoning
                            </span>
                            <div class="progress-bar">
                                <div class="progress-fill" style="width: ${metrics.problem_solving_depth}%">
                                    ${metrics.problem_solving_depth.toFixed(1)}
                                </div>
                            </div>
                        </div>
                    </div>

                    <h4 style="margin-top: 20px; color: #764ba2;">Cognitive Dimensions:</h4>
                    <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 10px; margin-top: 10px;">
            `;

            for (const [dim, data] of dimensions) {
                // Upper-case first, then escape, so entity names are not altered.
                const heading = escapeHtml(dim.replace(/_/g, ' ').toUpperCase());
                html += `
                        <div class="tooltip">
                            <small>${heading}</small>
                            <span class="tooltiptext">
                                <strong>${heading}</strong><br><br>
                                Score: <code>${data.score.toFixed(2)}/5.00</code><br>
                                Weight in IQ: <code>${weightOf(dim)}</code><br>
                                Tests evaluated: <code>${escapeHtml(data.count)}</code><br><br>
                                Normalized: ${data.normalized.toFixed(1)}%
                            </span>
                            <div class="progress-bar" style="height: 20px;">
                                <div class="progress-fill" style="width: ${data.normalized}%; font-size: 0.8em;">
                                    ${data.score.toFixed(1)}
                                </div>
                            </div>
                        </div>
                `;
            }

            // Close the dimensions grid and the metric card.
            html += `
                    </div>
                </div>
            `;
        }

        document.getElementById('intelligenceMetrics').innerHTML = html;

    } catch (error) {
        console.error('Error loading intelligence metrics:', error);
        document.getElementById('intelligenceMetrics').innerHTML =
            '<p class="loading">Error loading intelligence metrics</p>';
    }
}

/**
 * Fill the three selectors from `comparisonData`: the model picker and the
 * category filter (each keeps a leading "all/none" entry) and the category
 * chart selector (no placeholder). When at least one category exists, the
 * radar model toggles are built and the category chart is drawn.
 * Does nothing before the comparison data has loaded.
 */
function populateModelSelector() {
    if (!comparisonData) return;

    // Replace a select's options with an optional empty-valued placeholder
    // followed by one option per value.
    const fillSelect = (selectId, values, placeholder) => {
        const select = document.getElementById(selectId);
        select.innerHTML = '';
        if (placeholder !== undefined) {
            const first = document.createElement('option');
            first.value = '';
            first.textContent = placeholder;
            select.appendChild(first);
        }
        values.forEach(value => {
            const option = document.createElement('option');
            option.value = value;
            option.textContent = value;
            select.appendChild(option);
        });
    };

    const models = Object.keys(comparisonData.models);
    const categories = comparisonData.categories;

    fillSelect('modelSelect', models, 'Select a model...');
    // Populate category filter
    fillSelect('filterCategory', categories, 'All Categories');
    // Populate category chart selector
    fillSelect('categorySelect', categories);

    if (categories.length > 0) {
        initializeModelToggles();
        updateCategoryChart();
    }
}

// Category chart type state
let categoryChartType = 'bar';     // 'bar' or 'radar'
let selectedModelsForRadar = {};   // model name -> whether it is drawn on the radar chart

// Model colors for radar chart
// Each entry pairs a translucent fill (`bg`) with an opaque outline (`border`)
// of the same RGB colour; models reuse the palette cyclically by index.
const modelColors = [
    [102, 126, 234],
    [118, 75, 162],
    [16, 185, 129],
    [245, 158, 11],
    [239, 68, 68],
    [59, 130, 246],
    [236, 72, 153],
    [34, 197, 94],
    [168, 85, 247],
    [251, 146, 60]
].map(([r, g, b]) => ({
    bg: `rgba(${r}, ${g}, ${b}, 0.3)`,
    border: `rgba(${r}, ${g}, ${b}, 1)`
}));

/**
 * Rebuild the per-model visibility toggles for the radar chart inside
 * #modelToggleContainer and mark every model as selected.
 *
 * The toggles are created with DOM APIs and a change listener instead of an
 * HTML string with an inline handler, so a model name containing quotes or
 * markup can neither break the handler nor inject HTML.
 */
function initializeModelToggles() {
    if (!comparisonData) return;

    const container = document.getElementById('modelToggleContainer');
    container.innerHTML = '';

    Object.keys(comparisonData.models).forEach((model, index) => {
        const color = modelColors[index % modelColors.length].border;
        selectedModelsForRadar[model] = true; // All selected by default

        const item = document.createElement('label');
        item.className = 'model-toggle-item active';
        item.style.color = color;

        const swatch = document.createElement('span');
        swatch.className = 'model-color-indicator';
        swatch.style.background = color;

        const checkbox = document.createElement('input');
        checkbox.type = 'checkbox';
        checkbox.className = 'model-toggle-checkbox';
        checkbox.checked = true;
        checkbox.addEventListener('change', () => {
            toggleModelVisibility(model, checkbox.checked, item);
        });

        const caption = document.createElement('span');
        caption.className = 'model-toggle-label';
        caption.textContent = model;

        item.appendChild(swatch);
        item.appendChild(checkbox);
        item.appendChild(caption);
        container.appendChild(item);
    });
}

/**
 * Show or hide one model on the radar chart.
 *
 * @param {string} model - Model name (key in `selectedModelsForRadar`).
 * @param {boolean} isVisible - Whether the model should be drawn.
 * @param {Element} element - The toggle element whose 'active' class mirrors the state.
 */
function toggleModelVisibility(model, isVisible, element) {
    element.classList.toggle('active', isVisible);
    selectedModelsForRadar[model] = isVisible;
    updateCategoryChart();
}

/**
 * Switch the category chart between the bar and radar views and redraw it.
 *
 * @param {string} type - 'bar' or 'radar'.
 */
function setCategoryChartType(type) {
    categoryChartType = type;
    const radarSelected = type === 'radar';

    // Update button states
    document.getElementById('barChartBtn').classList.toggle('active', type === 'bar');
    document.getElementById('radarChartBtn').classList.toggle('active', radarSelected);

    // Show/hide radar controls
    document.getElementById('radarModelToggles').classList.toggle('visible', radarSelected);

    // Show/hide category selector (hide for radar since it shows all categories)
    document.getElementById('categorySelect').style.display = radarSelected ? 'none' : 'block';

    updateCategoryChart();
}

/**
 * Redraw the category chart in the currently selected view.
 * The previous chart instance is destroyed first. Does nothing before the
 * comparison data has loaded.
 */
function updateCategoryChart() {
    if (!comparisonData) return;

    const canvas = document.getElementById('categoryChart');
    if (window.categoryChartInstance) {
        window.categoryChartInstance.destroy();
    }

    const draw = categoryChartType === 'radar' ? updateRadarChart : updateBarChart;
    draw(canvas);
}

/**
 * Draw the bar view of the category chart: one bar per model with its average
 * score in the category chosen in #categorySelect. A model with no stats for
 * that category plots as 0. The chart is stored in `window.categoryChartInstance`.
 *
 * @param {Element} ctx - The canvas element to draw on.
 */
function updateBarChart(ctx) {
    const category = document.getElementById('categorySelect').value;
    const modelNames = Object.keys(comparisonData.models);

    const averages = modelNames.map(name => {
        const stats = comparisonData.models[name].category_stats[category];
        return stats ? stats.average : 0;
    });

    const dataset = {
        label: `${category} - Average Score`,
        data: averages,
        backgroundColor: 'rgba(102, 126, 234, 0.6)',
        borderColor: 'rgba(102, 126, 234, 1)',
        borderWidth: 2
    };

    window.categoryChartInstance = new Chart(ctx, {
        type: 'bar',
        data: { labels: modelNames, datasets: [dataset] },
        options: {
            responsive: true,
            maintainAspectRatio: false,
            scales: getScaleOptions(averages)
        }
    });
}

/**
 * Draw a radar chart with one polygon per selected model across all categories.
 *
 * Models whose flag in `selectedModelsForRadar` is falsy are left out. A model
 * keeps the colour for its position in the full model list, so hiding another
 * model does not change it. When `zoomedScale` is truthy the radial axis is
 * tightened to the plotted values plus or minus 0.5, clamped to the 0-5 range;
 * otherwise the axis spans 0-5. The new chart is stored on
 * `window.categoryChartInstance`.
 *
 * @param {*} ctx - Render target handed straight to the `Chart` constructor.
 */
function updateRadarChart(ctx) {
    const axisLabels = comparisonData.categories;
    const modelNames = Object.keys(comparisonData.models);

    const radarDatasets = [];
    const plottedValues = [];

    for (const [position, name] of modelNames.entries()) {
        if (!selectedModelsForRadar[name]) {
            continue;
        }

        const paletteSlot = position % modelColors.length;
        const categoryStats = comparisonData.models[name].category_stats;

        // A category the model has no stats for is plotted as 0.
        const scores = axisLabels.map(label => {
            const entry = categoryStats[label];
            return entry ? entry.average : 0;
        });
        plottedValues.push(...scores);

        radarDatasets.push({
            label: name,
            data: scores,
            backgroundColor: modelColors[paletteSlot].bg,
            borderColor: modelColors[paletteSlot].border,
            borderWidth: 2,
            pointBackgroundColor: modelColors[paletteSlot].border,
            pointBorderColor: '#fff',
            pointHoverBackgroundColor: '#fff',
            pointHoverBorderColor: modelColors[paletteSlot].border,
            pointRadius: 4,
            pointHoverRadius: 6
        });
    }

    // Default to the full 0-5 range; only zoom when there is something plotted.
    let radialMin = 0;
    let radialMax = 5;
    if (plottedValues.length > 0 && zoomedScale) {
        radialMin = Math.max(0, Math.min(...plottedValues) - 0.5);
        radialMax = Math.min(5, Math.max(...plottedValues) + 0.5);
    }

    const gridTint = 'rgba(102, 126, 234, 0.2)';

    const legendOptions = {
        position: 'top',
        labels: {
            padding: 20,
            usePointStyle: true,
            pointStyle: 'circle'
        }
    };

    const tooltipOptions = {
        callbacks: {
            label: function(context) {
                return `${context.dataset.label}: ${context.raw.toFixed(2)}`;
            }
        }
    };

    const radialAxis = {
        beginAtZero: !zoomedScale,
        min: radialMin,
        max: radialMax,
        ticks: {
            stepSize: zoomedScale ? 0.5 : 1,
            backdropColor: 'transparent'
        },
        grid: {
            color: gridTint
        },
        angleLines: {
            color: gridTint
        },
        pointLabels: {
            font: {
                size: 11
            }
        }
    };

    window.categoryChartInstance = new Chart(ctx, {
        type: 'radar',
        data: {
            labels: axisLabels,
            datasets: radarDatasets
        },
        options: {
            responsive: true,
            maintainAspectRatio: false,
            plugins: {
                legend: legendOptions,
                tooltip: tooltipOptions
            },
            scales: {
                r: radialAxis
            }
        }
    });
}
|
||
|
||
/**
 * Show the per-test results of the model chosen in the `modelSelect` element.
 *
 * Does nothing when no model is chosen or `comparisonData` is falsy. Otherwise
 * the model's `test_results` become `currentModelDetails` and are rendered
 * through `displayDetailsTable`.
 *
 * @returns {Promise<void>} Resolves once the table has been rendered (the body performs no awaits).
 */
async function loadModelDetails() {
    const { value: chosenModel } = document.getElementById('modelSelect');

    if (chosenModel && comparisonData) {
        currentModelDetails = comparisonData.models[chosenModel].test_results;
        displayDetailsTable(currentModelDetails);
    }
}
|
||
|
||
/**
 * Render the per-test results table into the `detailsTable` element.
 *
 * Every value taken from the result objects is HTML-escaped before it is
 * placed in the markup, so characters such as `<` or `&` in a test name or
 * note are shown literally instead of being parsed as HTML. A missing or
 * non-numeric score or generation time is shown as "N/A" rather than
 * throwing.
 *
 * @param {Array<Object>} results - Test result objects. Fields read here:
 *   `test_name`, `category`, `difficulty`, `score`, `generation_time`,
 *   `status`, `notes`, and `api_metrics.usage` (`total_tokens`,
 *   `completion_tokens`, `eval_count`).
 */
function displayDetailsTable(results) {
    const HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
    const escapeHtml = value => String(value).replace(/[&<>"']/g, ch => HTML_ESCAPES[ch]);
    const isNumber = value => typeof value === 'number' && Number.isFinite(value);

    const tableHead = `
        <table>
            <thead>
                <tr>
                    <th scope="col" onclick="sortTable('test_name')">Test Name</th>
                    <th scope="col" onclick="sortTable('category')">Category</th>
                    <th scope="col" onclick="sortTable('difficulty')">Difficulty</th>
                    <th scope="col" onclick="sortTable('score')">Score</th>
                    <th scope="col" onclick="sortTable('generation_time')">Time (s)</th>
                    <th scope="col" onclick="sortTable('tokens')">Tokens</th>
                    <th scope="col" onclick="sortTable('status')">Status</th>
                    <th scope="col">Notes</th>
                </tr>
            </thead>
            <tbody>
    `;

    const bodyRows = results.map(test => {
        // A missing score compares false against both thresholds and therefore
        // lands in the 'fail' class, as it did before.
        const scoreClass = test.score >= 4 ? 'exceptional' : test.score >= 2 ? 'pass' : 'fail';
        const scoreDisplay = isNumber(test.score) ? test.score.toFixed(1) : 'N/A';

        // A generation time of 0 is treated as "not recorded".
        const hasTime = isNumber(test.generation_time) && test.generation_time !== 0;
        const genTime = hasTime ? test.generation_time.toFixed(2) : 'N/A';

        let tokenCell = 'N/A';
        const usage = test.api_metrics && test.api_metrics.usage;
        if (usage) {
            // Fall back to eval_count when the usual token fields are absent.
            const totalTokens = usage.total_tokens || usage.eval_count;
            const completionTokens = usage.completion_tokens || usage.eval_count;

            if (totalTokens) {
                tokenCell = escapeHtml(totalTokens);

                // Throughput needs both a completion count and a usable time.
                if (hasTime && completionTokens) {
                    const tokensPerSecond = completionTokens / test.generation_time;
                    if (Number.isFinite(tokensPerSecond)) {
                        tokenCell += `<br><small>(${tokensPerSecond.toFixed(1)} t/s)</small>`;
                    }
                }
            }
        }

        return `
                <tr>
                    <td><strong>${escapeHtml(test.test_name)}</strong></td>
                    <td>${escapeHtml(test.category)}</td>
                    <td>${escapeHtml(test.difficulty)}</td>
                    <td><span class="score-badge score-${scoreClass}">${scoreDisplay}</span></td>
                    <td>${genTime}</td>
                    <td>${tokenCell}</td>
                    <td>${escapeHtml(test.status)}</td>
                    <td><small>${escapeHtml(test.notes || '')}</small></td>
                </tr>
        `;
    });

    document.getElementById('detailsTable').innerHTML =
        tableHead + bodyRows.join('') + '</tbody></table>';
}
|
||
|
||
/**
 * Re-render the details table with only the rows matching the active filters.
 *
 * Reads the search box (`searchInput`), category filter (`filterCategory`) and
 * score-band filter (`filterScore`). A row is kept when the search term occurs
 * (case-insensitively) in its test name or category, its category equals the
 * category filter (if one is set), and its score falls in the chosen band:
 * 'exceptional' is >= 4, 'pass' is >= 2 and < 4, 'fail' is < 2. Does nothing
 * while `currentModelDetails` is falsy.
 */
function filterTable() {
    if (!currentModelDetails) {
        return;
    }

    const needle = document.getElementById('searchInput').value.toLowerCase();
    const wantedCategory = document.getElementById('filterCategory').value;
    const scoreBand = document.getElementById('filterScore').value;

    const inScoreBand = score => {
        switch (scoreBand) {
            case 'exceptional':
                return score >= 4;
            case 'pass':
                return score >= 2 && score < 4;
            case 'fail':
                return score < 2;
            default:
                // No (or an unknown) band selected: every score passes.
                return true;
        }
    };

    const visibleRows = currentModelDetails.filter(test => {
        const textHit = [test.test_name, test.category]
            .some(field => field.toLowerCase().includes(needle));
        const categoryHit = !wantedCategory || test.category === wantedCategory;
        const scoreHit = inScoreBand(test.score);

        return textHit && categoryHit && scoreHit;
    });

    displayDetailsTable(visibleRows);
}
|
||
|
||
/**
 * Sort `currentModelDetails` in place by the given column and re-render.
 *
 * - 'score' sorts highest first; a missing score counts as 0.
 * - 'generation_time' and 'tokens' sort numerically, lowest first, with rows
 *   lacking a value placed last. Token counts are read from
 *   `api_metrics.usage` (`total_tokens`, falling back to `eval_count`), the
 *   same fields the table displays.
 * - Any other column sorts by the string form of the field using
 *   `localeCompare`, with missing values treated as the empty string.
 *
 * Does nothing while `currentModelDetails` is falsy. The active filters are
 * re-applied afterwards through `filterTable`.
 *
 * @param {string} column - Column key as passed by the table header handlers.
 */
function sortTable(column) {
    if (!currentModelDetails) return;

    // Returns the value when it is a usable number, otherwise null ("missing").
    const asNumber = value =>
        (typeof value === 'number' && Number.isFinite(value) ? value : null);

    // Ascending numeric order; rows without a value always sort to the end.
    const compareNumbersMissingLast = (left, right) => {
        if (left === null && right === null) return 0;
        if (left === null) return 1;
        if (right === null) return -1;
        return left - right;
    };

    // Numeric columns, each with the accessor that yields its sortable number.
    const numericAccessors = new Map([
        ['generation_time', test => asNumber(test.generation_time)],
        ['tokens', test => {
            const usage = test.api_metrics && test.api_metrics.usage;
            return usage ? asNumber(usage.total_tokens || usage.eval_count) : null;
        }]
    ]);
    const numericAccessor = numericAccessors.get(column);

    currentModelDetails.sort((a, b) => {
        if (column === 'score') {
            return (b[column] || 0) - (a[column] || 0);
        }
        if (numericAccessor) {
            return compareNumbersMissingLast(numericAccessor(a), numericAccessor(b));
        }
        return (a[column] || '').toString().localeCompare((b[column] || '').toString());
    });

    filterTable();
}
|
||
|
||
// Initialize on load
|
||
// Called directly at the end of this inline script (the last element before </body>), so the dashboard starts initializing as soon as the script executes.
initDashboard();
|
||
</script>
|
||
</body>
|
||
</html> |