improvements to graphs

This commit is contained in:
overcuriousity
2026-01-17 23:12:24 +01:00
parent 7a76f60bfe
commit 1fc952c811
2 changed files with 305 additions and 75 deletions

View File

@@ -362,6 +362,26 @@ class WebInterface:
test_results = [] test_results = []
for test in results.get('test_results', []): for test in results.get('test_results', []):
score = test.get('score') or test.get('overall_score') score = test.get('score') or test.get('overall_score')
# Handle notes differently for multi-turn vs single-turn tests
if test.get('type') == 'multi_turn' and 'turns' in test:
# Combine notes from all turns for multi-turn tests
turn_notes = []
for turn in test.get('turns', []):
turn_num = turn.get('turn', '?')
turn_note = turn.get('notes', '')
if turn_note:
turn_notes.append(f"T{turn_num}: {turn_note}")
notes = ' | '.join(turn_notes) if turn_notes else ''
# Get aggregate generation time and metrics for multi-turn
total_gen_time = sum(t.get('generation_time', 0) for t in test.get('turns', []))
api_metrics = test.get('aggregate_metrics', {})
else:
notes = test.get('notes', '')
total_gen_time = test.get('generation_time')
api_metrics = test.get('api_metrics')
test_data = { test_data = {
'test_id': test.get('test_id'), 'test_id': test.get('test_id'),
'test_name': test.get('test_name'), 'test_name': test.get('test_name'),
@@ -370,7 +390,9 @@ class WebInterface:
'difficulty': test.get('difficulty', 'medium'), 'difficulty': test.get('difficulty', 'medium'),
'score': score, 'score': score,
'status': test.get('status'), 'status': test.get('status'),
'notes': test.get('notes', '') 'notes': notes,
'generation_time': total_gen_time,
'api_metrics': api_metrics
} }
test_results.append(test_data) test_results.append(test_data)
@@ -688,6 +710,29 @@ class WebInterface:
box-shadow: 0 4px 15px var(--shadow-hover); box-shadow: 0 4px 15px var(--shadow-hover);
} }
.scale-toggle {
position: absolute;
top: 30px;
right: 140px;
background: var(--border-color);
border: none;
padding: 10px 20px;
border-radius: 20px;
cursor: pointer;
font-size: 1em;
transition: all 0.3s;
}
.scale-toggle:hover {
transform: scale(1.05);
box-shadow: 0 4px 15px var(--shadow-hover);
}
.scale-toggle.zoomed {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
}
h1 { h1 {
font-size: 2.5em; font-size: 2.5em;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
@@ -1000,6 +1045,7 @@ class WebInterface:
<body> <body>
<div class="container"> <div class="container">
<header> <header>
<button class="scale-toggle" id="scaleToggle" onclick="toggleScale()" title="Toggle between full scale (0-5) and zoomed view for better distinction">🔍 Full Scale</button>
<button class="theme-toggle" onclick="toggleTheme()">🌓 Toggle Dark Mode</button> <button class="theme-toggle" onclick="toggleTheme()">🌓 Toggle Dark Mode</button>
<h1>🧠 LLM Evaluation Dashboard</h1> <h1>🧠 LLM Evaluation Dashboard</h1>
<p class="subtitle">Comprehensive Intelligence & Performance Analysis</p> <p class="subtitle">Comprehensive Intelligence & Performance Analysis</p>
@@ -1095,6 +1141,8 @@ class WebInterface:
let statisticsData = null; let statisticsData = null;
let intelligenceData = null; let intelligenceData = null;
let currentModelDetails = null; let currentModelDetails = null;
let zoomedScale = false;
let overviewChartInstance = null;
// Theme toggle functionality // Theme toggle functionality
function toggleTheme() { function toggleTheme() {
@@ -1111,6 +1159,81 @@ class WebInterface:
} }
} }
/**
 * Flip the shared `zoomedScale` flag, sync the #scaleToggle button's label
 * and `zoomed` class with the new state, store the choice in localStorage
 * under the key 'zoomedScale', and redraw the charts.
 */
function toggleScale() {
    zoomedScale = !zoomedScale;

    const toggleButton = document.getElementById('scaleToggle');
    toggleButton.textContent = zoomedScale ? '🔎 Zoomed' : '🔍 Full Scale';
    if (zoomedScale) {
        toggleButton.classList.add('zoomed');
    } else {
        toggleButton.classList.remove('zoomed');
    }

    // Stored as 'enabled' / 'disabled'; loadScalePreference() reads it back.
    const storedValue = zoomedScale ? 'enabled' : 'disabled';
    localStorage.setItem('zoomedScale', storedValue);

    // Every chart has to be rebuilt so it picks up the new axis range.
    refreshAllCharts();
}
/**
 * Restore the zoom preference saved in localStorage under 'zoomedScale'.
 * Only the value 'enabled' has an effect: it turns the shared `zoomedScale`
 * flag on and puts the #scaleToggle button into its zoomed appearance.
 * Any other stored value (or none) leaves everything untouched.
 */
function loadScalePreference() {
    if (localStorage.getItem('zoomedScale') !== 'enabled') {
        return;
    }

    zoomedScale = true;
    const toggleButton = document.getElementById('scaleToggle');
    toggleButton.textContent = '🔎 Zoomed';
    toggleButton.classList.add('zoomed');
}
/**
 * Build the `scales` option handed to `new Chart(...)` for a score chart.
 *
 * When the shared `zoomedScale` flag is off, or when `data` holds no usable
 * number, the axis is fixed to the full 0-5 range. Otherwise the axis is
 * narrowed to the data's min/max plus padding, never leaving the 0-5 range.
 *
 * @param {Array<number|string|null|undefined>} data - Values plotted on the
 *   axis. Null, undefined and anything that does not convert to a finite
 *   number are ignored; a non-array is treated as empty.
 * @param {boolean} [isRadar=false] - Configure the radial axis `r` instead
 *   of the `y` axis.
 * @returns {Object} An object with a single `y` or `r` key holding either
 *   `{ beginAtZero: true, max: 5 }` or `{ min, max, beginAtZero: false }`.
 */
function getScaleOptions(data, isRadar = false) {
    const axisKey = isRadar ? 'r' : 'y';
    const fullScale = { [axisKey]: { beginAtZero: true, max: 5 } };

    if (!zoomedScale) {
        return fullScale;
    }

    // Number.isFinite (unlike the global isNaN) also rejects +/-Infinity,
    // which would otherwise turn the computed bounds into NaN.
    const validData = (Array.isArray(data) ? data : [])
        .filter((d) => d !== null && d !== undefined)
        .map(Number)
        .filter(Number.isFinite);
    if (validData.length === 0) {
        return fullScale;
    }

    // Clamp to the 0-5 axis limits first; without this, data entirely above 5
    // or below 0 yields min > max (an inverted axis).
    const clamp = (value) => Math.min(5, Math.max(0, value));
    const minVal = clamp(Math.min(...validData));
    const maxVal = clamp(Math.max(...validData));

    const range = maxVal - minVal;
    const padding = Math.max(range * 0.2, 0.2); // At least 0.2 padding

    // Round outwards to one decimal so the data never touches the axis edge.
    let min = Math.max(0, Math.floor((minVal - padding) * 10) / 10);
    let max = Math.min(5, Math.ceil((maxVal + padding) * 10) / 10);

    // Ensure we have at least some range when the data is nearly constant
    // or sits right at an axis limit.
    if (max - min < 0.5) {
        min = Math.max(0, minVal - 0.3);
        max = Math.min(5, maxVal + 0.3);
    }

    return { [axisKey]: { min: min, max: max, beginAtZero: false } };
}
/**
 * Redraw the overview, comparison and category charts, in that order.
 * Does nothing while `comparisonData` is still unset.
 */
function refreshAllCharts() {
    if (!comparisonData) {
        return;
    }

    refreshOverviewChart();
    updateComparisonChart();
    updateCategoryChart();
}
// Tab switching // Tab switching
function switchTab(tabName) { function switchTab(tabName) {
document.querySelectorAll('.tab').forEach(t => t.classList.remove('active')); document.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
@@ -1123,6 +1246,7 @@ class WebInterface:
// Initialize dashboard // Initialize dashboard
async function initDashboard() { async function initDashboard() {
loadThemePreference(); loadThemePreference();
loadScalePreference();
await loadOverview(); await loadOverview();
await loadComparison(); await loadComparison();
await loadStatistics(); await loadStatistics();
@@ -1163,14 +1287,31 @@ class WebInterface:
document.getElementById('overviewStats').innerHTML = statsHtml; document.getElementById('overviewStats').innerHTML = statsHtml;
// Create overview chart // Create overview chart
refreshOverviewChart();
} catch (error) {
console.error('Error loading overview:', error);
}
}
function refreshOverviewChart() {
if (!comparisonData) return;
const models = Object.keys(comparisonData.models);
const data = models.map(m => comparisonData.models[m].overall_stats.average || 0);
if (overviewChartInstance) {
overviewChartInstance.destroy();
}
const ctx = document.getElementById('overviewChart').getContext('2d'); const ctx = document.getElementById('overviewChart').getContext('2d');
new Chart(ctx, { overviewChartInstance = new Chart(ctx, {
type: 'bar', type: 'bar',
data: { data: {
labels: models, labels: models,
datasets: [{ datasets: [{
label: 'Average Score', label: 'Average Score',
data: models.map(m => comparisonData.models[m].overall_stats.average || 0), data: data,
backgroundColor: 'rgba(102, 126, 234, 0.6)', backgroundColor: 'rgba(102, 126, 234, 0.6)',
borderColor: 'rgba(102, 126, 234, 1)', borderColor: 'rgba(102, 126, 234, 1)',
borderWidth: 2 borderWidth: 2
@@ -1179,12 +1320,7 @@ class WebInterface:
options: { options: {
responsive: true, responsive: true,
maintainAspectRatio: false, maintainAspectRatio: false,
scales: { scales: getScaleOptions(data)
y: {
beginAtZero: true,
max: 5
}
}
} }
}); });
@@ -1240,11 +1376,7 @@ class WebInterface:
options: { options: {
responsive: true, responsive: true,
maintainAspectRatio: false, maintainAspectRatio: false,
scales: { scales: getScaleOptions(data, true)
r: {
beginAtZero: true
}
}
} }
}); });
} }
@@ -1465,12 +1597,7 @@ class WebInterface:
options: { options: {
responsive: true, responsive: true,
maintainAspectRatio: false, maintainAspectRatio: false,
scales: { scales: getScaleOptions(data)
y: {
beginAtZero: true,
max: 5
}
}
} }
}); });
} }
@@ -1535,7 +1662,7 @@ class WebInterface:
<td>${genTime}</td> <td>${genTime}</td>
<td>${tokenInfo}${tokensPerSec}</td> <td>${tokenInfo}${tokensPerSec}</td>
<td>${test.status}</td> <td>${test.status}</td>
<td><small>${test.notes}</small></td> <td><small>${test.notes || ''}</small></td>
</tr> </tr>
`; `;
}); });

View File

@@ -78,6 +78,29 @@
box-shadow: 0 4px 15px var(--shadow-hover); box-shadow: 0 4px 15px var(--shadow-hover);
} }
.scale-toggle {
position: absolute;
top: 30px;
right: 140px;
background: var(--border-color);
border: none;
padding: 10px 20px;
border-radius: 20px;
cursor: pointer;
font-size: 1em;
transition: all 0.3s;
}
.scale-toggle:hover {
transform: scale(1.05);
box-shadow: 0 4px 15px var(--shadow-hover);
}
.scale-toggle.zoomed {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
}
h1 { h1 {
font-size: 2.5em; font-size: 2.5em;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
@@ -389,8 +412,7 @@
</head> </head>
<body> <body>
<div class="container"> <div class="container">
<header> <header> <button class="scale-toggle" id="scaleToggle" onclick="toggleScale()" title="Toggle between full scale (0-5) and zoomed view for better distinction">🔍 Full Scale</button> <button class="theme-toggle" onclick="toggleTheme()">🌓 Toggle Dark Mode</button>
<button class="theme-toggle" onclick="toggleTheme()">🌓 Toggle Dark Mode</button>
<h1>🧠 LLM Evaluation Dashboard</h1> <h1>🧠 LLM Evaluation Dashboard</h1>
<p class="subtitle">Comprehensive Intelligence & Performance Analysis</p> <p class="subtitle">Comprehensive Intelligence & Performance Analysis</p>
</header> </header>
@@ -485,6 +507,8 @@
let statisticsData = null; let statisticsData = null;
let intelligenceData = null; let intelligenceData = null;
let currentModelDetails = null; let currentModelDetails = null;
let zoomedScale = false;
let overviewChartInstance = null;
// Theme toggle functionality // Theme toggle functionality
function toggleTheme() { function toggleTheme() {
@@ -501,6 +525,81 @@
} }
} }
/**
 * Flip the shared `zoomedScale` flag, sync the #scaleToggle button's label
 * and `zoomed` class with the new state, store the choice in localStorage
 * under the key 'zoomedScale', and redraw the charts.
 */
function toggleScale() {
    zoomedScale = !zoomedScale;

    const toggleButton = document.getElementById('scaleToggle');
    toggleButton.textContent = zoomedScale ? '🔎 Zoomed' : '🔍 Full Scale';
    if (zoomedScale) {
        toggleButton.classList.add('zoomed');
    } else {
        toggleButton.classList.remove('zoomed');
    }

    // Stored as 'enabled' / 'disabled'; loadScalePreference() reads it back.
    const storedValue = zoomedScale ? 'enabled' : 'disabled';
    localStorage.setItem('zoomedScale', storedValue);

    // Every chart has to be rebuilt so it picks up the new axis range.
    refreshAllCharts();
}
/**
 * Restore the zoom preference saved in localStorage under 'zoomedScale'.
 * Only the value 'enabled' has an effect: it turns the shared `zoomedScale`
 * flag on and puts the #scaleToggle button into its zoomed appearance.
 * Any other stored value (or none) leaves everything untouched.
 */
function loadScalePreference() {
    if (localStorage.getItem('zoomedScale') !== 'enabled') {
        return;
    }

    zoomedScale = true;
    const toggleButton = document.getElementById('scaleToggle');
    toggleButton.textContent = '🔎 Zoomed';
    toggleButton.classList.add('zoomed');
}
/**
 * Build the `scales` option handed to `new Chart(...)` for a score chart.
 *
 * When the shared `zoomedScale` flag is off, or when `data` holds no usable
 * number, the axis is fixed to the full 0-5 range. Otherwise the axis is
 * narrowed to the data's min/max plus padding, never leaving the 0-5 range.
 *
 * @param {Array<number|string|null|undefined>} data - Values plotted on the
 *   axis. Null, undefined and anything that does not convert to a finite
 *   number are ignored; a non-array is treated as empty.
 * @param {boolean} [isRadar=false] - Configure the radial axis `r` instead
 *   of the `y` axis.
 * @returns {Object} An object with a single `y` or `r` key holding either
 *   `{ beginAtZero: true, max: 5 }` or `{ min, max, beginAtZero: false }`.
 */
function getScaleOptions(data, isRadar = false) {
    const axisKey = isRadar ? 'r' : 'y';
    const fullScale = { [axisKey]: { beginAtZero: true, max: 5 } };

    if (!zoomedScale) {
        return fullScale;
    }

    // Number.isFinite (unlike the global isNaN) also rejects +/-Infinity,
    // which would otherwise turn the computed bounds into NaN.
    const validData = (Array.isArray(data) ? data : [])
        .filter((d) => d !== null && d !== undefined)
        .map(Number)
        .filter(Number.isFinite);
    if (validData.length === 0) {
        return fullScale;
    }

    // Clamp to the 0-5 axis limits first; without this, data entirely above 5
    // or below 0 yields min > max (an inverted axis).
    const clamp = (value) => Math.min(5, Math.max(0, value));
    const minVal = clamp(Math.min(...validData));
    const maxVal = clamp(Math.max(...validData));

    const range = maxVal - minVal;
    const padding = Math.max(range * 0.2, 0.2); // At least 0.2 padding

    // Round outwards to one decimal so the data never touches the axis edge.
    let min = Math.max(0, Math.floor((minVal - padding) * 10) / 10);
    let max = Math.min(5, Math.ceil((maxVal + padding) * 10) / 10);

    // Ensure we have at least some range when the data is nearly constant
    // or sits right at an axis limit.
    if (max - min < 0.5) {
        min = Math.max(0, minVal - 0.3);
        max = Math.min(5, maxVal + 0.3);
    }

    return { [axisKey]: { min: min, max: max, beginAtZero: false } };
}
/**
 * Redraw the overview, comparison and category charts, in that order.
 * Does nothing while `comparisonData` is still unset.
 */
function refreshAllCharts() {
    if (!comparisonData) {
        return;
    }

    refreshOverviewChart();
    updateComparisonChart();
    updateCategoryChart();
}
// Tab switching // Tab switching
function switchTab(tabName) { function switchTab(tabName) {
document.querySelectorAll('.tab').forEach(t => t.classList.remove('active')); document.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
@@ -513,6 +612,7 @@
// Initialize dashboard // Initialize dashboard
async function initDashboard() { async function initDashboard() {
loadThemePreference(); loadThemePreference();
loadScalePreference();
await loadOverview(); await loadOverview();
await loadComparison(); await loadComparison();
await loadStatistics(); await loadStatistics();
@@ -553,14 +653,31 @@
document.getElementById('overviewStats').innerHTML = statsHtml; document.getElementById('overviewStats').innerHTML = statsHtml;
// Create overview chart // Create overview chart
refreshOverviewChart();
} catch (error) {
console.error('Error loading overview:', error);
}
}
function refreshOverviewChart() {
if (!comparisonData) return;
const models = Object.keys(comparisonData.models);
const data = models.map(m => comparisonData.models[m].overall_stats.average || 0);
if (overviewChartInstance) {
overviewChartInstance.destroy();
}
const ctx = document.getElementById('overviewChart').getContext('2d'); const ctx = document.getElementById('overviewChart').getContext('2d');
new Chart(ctx, { overviewChartInstance = new Chart(ctx, {
type: 'bar', type: 'bar',
data: { data: {
labels: models, labels: models,
datasets: [{ datasets: [{
label: 'Average Score', label: 'Average Score',
data: models.map(m => comparisonData.models[m].overall_stats.average || 0), data: data,
backgroundColor: 'rgba(102, 126, 234, 0.6)', backgroundColor: 'rgba(102, 126, 234, 0.6)',
borderColor: 'rgba(102, 126, 234, 1)', borderColor: 'rgba(102, 126, 234, 1)',
borderWidth: 2 borderWidth: 2
@@ -569,12 +686,7 @@
options: { options: {
responsive: true, responsive: true,
maintainAspectRatio: false, maintainAspectRatio: false,
scales: { scales: getScaleOptions(data)
y: {
beginAtZero: true,
max: 5
}
}
} }
}); });
@@ -630,11 +742,7 @@
options: { options: {
responsive: true, responsive: true,
maintainAspectRatio: false, maintainAspectRatio: false,
scales: { scales: getScaleOptions(data, true)
r: {
beginAtZero: true
}
}
} }
}); });
} }
@@ -855,12 +963,7 @@
options: { options: {
responsive: true, responsive: true,
maintainAspectRatio: false, maintainAspectRatio: false,
scales: { scales: getScaleOptions(data)
y: {
beginAtZero: true,
max: 5
}
}
} }
}); });
} }
@@ -925,7 +1028,7 @@
<td>${genTime}</td> <td>${genTime}</td>
<td>${tokenInfo}${tokensPerSec}</td> <td>${tokenInfo}${tokensPerSec}</td>
<td>${test.status}</td> <td>${test.status}</td>
<td><small>${test.notes}</small></td> <td><small>${test.notes || ''}</small></td>
</tr> </tr>
`; `;
}); });