diff --git a/analyze_results.py b/analyze_results.py
index 4904db3..188f9eb 100644
--- a/analyze_results.py
+++ b/analyze_results.py
@@ -362,6 +362,26 @@ class WebInterface:
         test_results = []
         for test in results.get('test_results', []):
             score = test.get('score') or test.get('overall_score')
+
+            # Multi-turn tests carry notes/timing per turn; single-turn tests carry them on the test itself
+            if test.get('type') == 'multi_turn' and 'turns' in test:
+                # Combine the non-empty notes of all turns as "T<turn>: <note>", joined with " | "
+                turn_notes = []
+                for turn in test.get('turns', []):
+                    turn_num = turn.get('turn', '?')
+                    turn_note = turn.get('notes', '')
+                    if turn_note:
+                        turn_notes.append(f"T{turn_num}: {turn_note}")
+                notes = ' | '.join(turn_notes) if turn_notes else ''
+
+                # Sum generation time over turns; "or 0" also covers turns whose generation_time is stored as None
+                total_gen_time = sum(t.get('generation_time') or 0 for t in test.get('turns', []))
+                api_metrics = test.get('aggregate_metrics', {})
+            else:
+                notes = test.get('notes', '')
+                total_gen_time = test.get('generation_time')
+                api_metrics = test.get('api_metrics')
+
             test_data = {
                 'test_id': test.get('test_id'),
                 'test_name': test.get('test_name'),
@@ -370,7 +390,9 @@ class WebInterface:
                 'difficulty': test.get('difficulty', 'medium'),
                 'score': score,
                 'status': test.get('status'),
-                'notes': test.get('notes', '')
+                'notes': notes,
+                'generation_time': total_gen_time,
+                'api_metrics': api_metrics
             }
             test_results.append(test_data)
 
@@ -688,6 +710,29 @@ class WebInterface:
             box-shadow: 0 4px 15px var(--shadow-hover);
         }
 
+        .scale-toggle {
+            position: absolute;
+            top: 30px;
+            right: 140px;
+            background: var(--border-color);
+            border: none;
+            padding: 10px 20px;
+            border-radius: 20px;
+            cursor: pointer;
+            font-size: 1em;
+            transition: all 0.3s;
+        }
+
+        .scale-toggle:hover {
+            transform: scale(1.05);
+            box-shadow: 0 4px 15px var(--shadow-hover);
+        }
+
+        .scale-toggle.zoomed {
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            color: white;
+        }
+
         h1 {
             font-size: 2.5em;
             background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
@@ -1000,6 +1045,7 @@ class WebInterface:
Comprehensive Intelligence & Performance Analysis
@@ -1095,6 +1141,8 @@ class WebInterface:
         let statisticsData = null;
         let intelligenceData = null;
         let currentModelDetails = null;
+        let zoomedScale = false;           // true while charts use the zoomed (data-fitted) axis
+        let overviewChartInstance = null;  // live overview Chart, kept so it can be destroyed before a redraw
 
         // Theme toggle functionality
         function toggleTheme() {
@@ -1111,6 +1159,81 @@ class WebInterface:
             }
         }
 
+        // Scale toggle: flip full/zoomed mode, update the button, persist the choice, redraw the charts
+        function toggleScale() {
+            zoomedScale = !zoomedScale;
+            const btn = document.getElementById('scaleToggle');
+            if (zoomedScale) {
+                btn.textContent = '🔎 Zoomed';
+                btn.classList.add('zoomed');
+            } else {
+                btn.textContent = '🔍 Full Scale';
+                btn.classList.remove('zoomed');
+            }
+            localStorage.setItem('zoomedScale', zoomedScale ? 'enabled' : 'disabled');
+            // Refresh all charts with new scale
+            refreshAllCharts();
+        }
+
+        // Load scale preference saved by toggleScale(); only the 'enabled' value changes the default state
+        function loadScalePreference() {
+            const savedScale = localStorage.getItem('zoomedScale');
+            if (savedScale === 'enabled') {
+                zoomedScale = true;
+                const btn = document.getElementById('scaleToggle');
+                btn.textContent = '🔎 Zoomed';
+                btn.classList.add('zoomed');
+            }
+        }
+
+        // Build the Chart.js `scales` option: key `r` for radar charts, `y` otherwise
+        function getScaleOptions(data, isRadar = false) {
+            if (!zoomedScale) {
+                // Full scale: 0 to 5
+                if (isRadar) {
+                    return { r: { beginAtZero: true, max: 5 } };
+                }
+                return { y: { beginAtZero: true, max: 5 } };
+            }
+
+            // Zoomed scale: calculate min/max with padding; with no usable values fall back to full scale
+            const validData = data.filter(d => d !== null && d !== undefined && !isNaN(d));
+            if (validData.length === 0) {
+                if (isRadar) {
+                    return { r: { beginAtZero: true, max: 5 } };
+                }
+                return { y: { beginAtZero: true, max: 5 } };
+            }
+
+            const minVal = Math.min(...validData);
+            const maxVal = Math.max(...validData);
+            const range = maxVal - minVal;
+            const padding = Math.max(range * 0.2, 0.2); // At least 0.2 padding
+
+            let min = Math.max(0, Math.floor((minVal - padding) * 10) / 10);  // rounded down to 0.1, clamped at 0
+            let max = Math.min(5, Math.ceil((maxVal + padding) * 10) / 10);   // rounded up to 0.1, clamped at 5
+
+            // Ensure we have at least some range
+            if (max - min < 0.5) {
+                min = Math.max(0, minVal - 0.3);
+                max = Math.min(5, maxVal + 0.3);
+            }
+
+            if (isRadar) {
+                return { r: { min: min, max: max, beginAtZero: false } };
+            }
+            return { y: { min: min, max: max, beginAtZero: false } };
+        }
+
+        // Refresh all charts when scale changes; does nothing until comparisonData has been loaded
+        function refreshAllCharts() {
+            if (comparisonData) {
+                refreshOverviewChart();
+                updateComparisonChart();
+                updateCategoryChart();
+            }
+        }
+
         // Tab switching
         function switchTab(tabName) {
             document.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
@@ -1123,6 +1246,7 @@ class WebInterface:
         // Initialize dashboard
         async function initDashboard() {
             loadThemePreference();
+            loadScalePreference();
             await loadOverview();
             await loadComparison();
             await loadStatistics();
@@ -1163,30 +1287,42 @@ class WebInterface:
 
                 document.getElementById('overviewStats').innerHTML = statsHtml;
 
                 // Create overview chart
-                const ctx = document.getElementById('overviewChart').getContext('2d');
-                new Chart(ctx, {
-                    type: 'bar',
-                    data: {
-                        labels: models,
-                        datasets: [{
-                            label: 'Average Score',
-                            data: models.map(m => comparisonData.models[m].overall_stats.average || 0),
-                            backgroundColor: 'rgba(102, 126, 234, 0.6)',
-                            borderColor: 'rgba(102, 126, 234, 1)',
-                            borderWidth: 2
-                        }]
-                    },
-                    options: {
-                        responsive: true,
-                        maintainAspectRatio: false,
-                        scales: {
-                            y: {
-                                beginAtZero: true,
-                                max: 5
-                            }
-                        }
-                    }
-                });
+                refreshOverviewChart();
+
+            } catch (error) {
+                console.error('Error loading overview:', error);
+            }
+        }
+
+        function refreshOverviewChart() {
+            if (!comparisonData) return;
+            try {  // pairs with the pre-existing catch kept as trailing context of this hunk
+                const models = Object.keys(comparisonData.models);
+                const data = models.map(m => comparisonData.models[m].overall_stats.average || 0);
+
+                if (overviewChartInstance) {
+                    overviewChartInstance.destroy();
+                }
+
+                const ctx = document.getElementById('overviewChart').getContext('2d');
+                overviewChartInstance = new Chart(ctx, {
+                    type: 'bar',
+                    data: {
+                        labels: models,
+                        datasets: [{
+                            label: 'Average Score',
+                            data: data,
+                            backgroundColor: 'rgba(102, 126, 234, 0.6)',
+                            borderColor: 'rgba(102, 126, 234, 1)',
+                            borderWidth: 2
+                        }]
+                    },
+                    options: {
+                        responsive: true,
+                        maintainAspectRatio: false,
+                        scales: getScaleOptions(data)
+                    }
+                });
             } catch (error) {
                 console.error('Error loading overview:', error);
@@ -1240,11 +1376,7 @@ class WebInterface:
                 options: {
                     responsive: true,
                     maintainAspectRatio: false,
-                    scales: {
-                        r: {
-                            beginAtZero: true
-                        }
-                    }
+                    scales: getScaleOptions(data, true)
                 }
             });
         }
@@ -1465,12 +1597,7 @@ class WebInterface:
                 options: {
                     responsive: true,
                     maintainAspectRatio: false,
-                    scales: {
-                        y: {
-                            beginAtZero: true,
-                            max: 5
-                        }
-                    }
+                    scales: getScaleOptions(data)
                 }
             });
         }
@@ -1535,7 +1662,7 @@ class WebInterface:Comprehensive Intelligence & Performance Analysis