<!DOCTYPE html><html lang="en"><head>
<meta charset="UTF-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1.0"/>
<title>A Cookbook for Building Self-Evolving Agents</title>
<script src="https://cdn.tailwindcss.com"></script>
<link rel="preconnect" href="https://fonts.googleapis.com"/>
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin=""/>
<link href="https://fonts.googleapis.com/css2?family=Playfair+Display:ital,wght@0,400;0,600;0,700;1,400;1,600&family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet"/>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css"/>
<script src="https://cdn.jsdelivr.net/npm/mermaid/dist/mermaid.min.js"></script>
<script>
  // Tailwind Play CDN configuration. It has to run after the CDN <script> above,
  // because it assigns onto the global `tailwind` object that script provides.
  // Everything lives under `extend`, so Tailwind's defaults stay available.
  tailwind.config = {
    theme: {
      extend: {
        // Typeface stacks behind the `font-serif` / `font-sans` utilities.
        fontFamily: {
          serif: ["Playfair Display", "serif"],
          sans: ["Inter", "sans-serif"],
        },
        // Named palette used as `text-*`, `bg-*` and `border-*` utilities in the markup.
        colors: {
          primary: "#1a1a1a",
          secondary: "#4a4a4a",
          accent: "#0d9488",
          muted: "#6b7280",
          background: "#fefefe",
          surface: "#f8fafc",
          border: "#e5e7eb",
        },
      },
    },
  };
</script>
<style>
.hero-gradient {
background: linear-gradient(135deg, #0f172a 0%, #1e293b 50%, #334155 100%);
}
.text-shadow {
text-shadow: 0 2px 4px rgba(0,0,0,0.3);
}
.glass-effect {
backdrop-filter: blur(10px);
background: rgba(255, 255, 255, 0.1);
border: 1px solid rgba(255, 255, 255, 0.2);
}
/* Full-height table-of-contents sidebar pinned to the left edge of the viewport. */
.toc-fixed {
position: fixed;
top: 0;
left: 0;
height: 100vh;
width: 280px; /* keep in sync with the margin-left on .content-offset */
background: #fefefe;
border-right: 1px solid #e5e7eb;
overflow-y: auto; /* long tables of contents scroll inside the sidebar */
z-index: 1000; /* higher than the z-50 utility on the #toc-toggle button */
padding: 2rem 1.5rem;
}
/* Pushes the main column right by the sidebar width so the fixed sidebar does not cover it. */
.content-offset {
margin-left: 280px;
}
.toc-link {
transition: all 0.2s ease;
}
.toc-link:hover {
color: #0d9488;
padding-left: 0.5rem;
}
.citation-link {
color: #0d9488;
text-decoration: none;
font-weight: 500;
}
.citation-link:hover {
text-decoration: underline;
}
.section-divider {
background: linear-gradient(90deg, #0d9488 0%, transparent 100%);
height: 2px;
margin: 3rem 0;
}
.mermaid-container {
display: flex;
justify-content: center;
min-height: 300px;
max-height: 800px;
background: #ffffff;
border: 2px solid #e5e7eb;
border-radius: 12px;
padding: 30px;
margin: 30px 0;
box-shadow: 0 8px 25px rgba(0, 0, 0, 0.08);
position: relative;
overflow: hidden;
}
.mermaid-container .mermaid {
width: 100%;
max-width: 100%;
height: 100%;
cursor: grab;
transition: transform 0.3s ease;
transform-origin: center center;
display: flex;
justify-content: center;
align-items: center;
touch-action: none;
-webkit-user-select: none;
-moz-user-select: none;
-ms-user-select: none;
user-select: none;
}
/* Renders the diagram SVG as a horizontally centred block inside its wrapper.
   NOTE(review): max-width and height below lose to the later `.mermaid svg` rule,
   whose max-width/height declarations are !important - confirm which sizing is intended. */
.mermaid-container .mermaid svg {
max-width: 100%;
height: 100%;
display: block;
margin: 0 auto;
}
.mermaid-container .mermaid:active {
cursor: grabbing;
}
.mermaid-container.zoomed .mermaid {
height: 100%;
width: 100%;
cursor: grab;
}
.mermaid-controls {
position: absolute;
top: 15px;
right: 15px;
display: flex;
gap: 10px;
z-index: 20;
background: rgba(255, 255, 255, 0.95);
padding: 8px;
border-radius: 8px;
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);
}
.mermaid-control-btn {
background: #ffffff;
border: 1px solid #d1d5db;
border-radius: 6px;
padding: 10px;
cursor: pointer;
transition: all 0.2s ease;
color: #374151;
font-size: 14px;
min-width: 36px;
height: 36px;
text-align: center;
display: flex;
align-items: center;
justify-content: center;
}
.mermaid-control-btn:hover {
background: #f8fafc;
border-color: #3b82f6;
color: #3b82f6;
transform: translateY(-1px);
}
.mermaid-control-btn:active {
transform: scale(0.95);
}
/* Enhanced mermaid styling with better contrast for different node types */
.mermaid svg {
max-width: none !important;
height: auto !important;
font-family: 'Inter', sans-serif !important;
background: transparent !important;
}
/* Primary nodes - teal theme */
.mermaid .node.primary rect,
.mermaid .node.primary circle,
.mermaid .node.primary ellipse,
.mermaid .node.primary polygon {
fill: #0d9488 !important;
stroke: #0f766e !important;
stroke-width: 2.5px !important;
filter: drop-shadow(0 2px 4px rgba(13, 148, 136, 0.3));
}
.mermaid .node.primary .label {
color: #ffffff !important;
font-weight: 600 !important;
font-size: 14px !important;
text-shadow: 0 1px 2px rgba(0, 0, 0, 0.2);
}
/* Secondary nodes - light teal theme */
.mermaid .node.secondary rect,
.mermaid .node.secondary circle,
.mermaid .node.secondary ellipse,
.mermaid .node.secondary polygon {
fill: #5eead4 !important;
stroke: #14b8a6 !important;
stroke-width: 2px !important;
filter: drop-shadow(0 2px 4px rgba(94, 234, 212, 0.3));
}
.mermaid .node.secondary .label {
color: #134e4a !important;
font-weight: 600 !important;
font-size: 13px !important;
}
/* Tertiary nodes - light gray theme */
.mermaid .node.tertiary rect,
.mermaid .node.tertiary circle,
.mermaid .node.tertiary ellipse,
.mermaid .node.tertiary polygon {
fill: #f1f5f9 !important;
stroke: #64748b !important;
stroke-width: 1.5px !important;
filter: drop-shadow(0 2px 4px rgba(100, 116, 139, 0.2));
}
.mermaid .node.tertiary .label {
color: #334155 !important;
font-weight: 500 !important;
font-size: 12px !important;
}
/* Default node styling */
.mermaid .node rect,
.mermaid .node circle,
.mermaid .node ellipse,
.mermaid .node polygon {
fill: #ffffff !important;
stroke: #0d9488 !important;
stroke-width: 2px !important;
filter: drop-shadow(0 2px 4px rgba(13, 148, 136, 0.2));
}
.mermaid .node .label {
color: #1a1a1a !important;
font-weight: 600 !important;
font-size: 13px !important;
text-shadow: 0 1px 2px rgba(255, 255, 255, 0.8);
}
/* Edge styling */
.mermaid .edgePath .path {
stroke: #64748b !important;
stroke-width: 2.5px !important;
filter: drop-shadow(0 1px 2px rgba(100, 116, 139, 0.3));
}
.mermaid .edgeLabel {
background-color: rgba(255, 255, 255, 0.95) !important;
color: #374151 !important;
font-weight: 500 !important;
font-size: 12px !important;
padding: 6px 10px !important;
border-radius: 6px !important;
border: 1px solid #e5e7eb !important;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1) !important;
}
/* Timeline specific styling */
.mermaid .section {
fill: #f1f5f9 !important;
stroke: #d1d5db !important;
}
.mermaid .section0 {
fill: #ecfdf5 !important;
stroke: #10b981 !important;
}
.mermaid .section1 {
fill: #eff6ff !important;
stroke: #3b82f6 !important;
}
.mermaid .section2 {
fill: #fef3c7 !important;
stroke: #f59e0b !important;
}
.mermaid .section3 {
fill: #fce7f3 !important;
stroke: #ec4899 !important;
}
.mermaid .cScale0, .mermaid .cScale1, .mermaid .cScale2, .mermaid .cScale3 {
fill: #0d9488 !important;
stroke: #0f766e !important;
}
.mermaid .cScale0 .label, .mermaid .cScale1 .label, .mermaid .cScale2 .label, .mermaid .cScale3 .label {
color: #ffffff !important;
font-weight: 600 !important;
}
/* Viewports up to 1024px wide: the sidebar slides off-canvas and the content
   reclaims the full width. The at-rule keyword was previously wrapped in stray
   HTML markup, which made the parser discard this whole block. */
@media (max-width: 1024px) {
  /* Hidden by default; adding the `open` class slides it back into view. */
  .toc-fixed {
    transform: translateX(-100%);
    transition: transform 0.3s ease;
  }
  .toc-fixed.open {
    transform: translateX(0);
  }
  /* No sidebar on screen, so drop the 280px gutter. */
  .content-offset {
    margin-left: 0;
  }
  /* Responsive mermaid controls: keep only the reset button and move the bar to the bottom. */
  .mermaid-control-btn:not(.reset-zoom) {
    display: none;
  }
  .mermaid-controls {
    top: auto;
    bottom: 15px;
    right: 15px;
  }
}
/* Viewports up to 768px wide: shrink the heading and paragraph inside any
   <section> carrying the `relative` class (the hero section in this page).
   The at-rule keyword was previously wrapped in stray HTML markup, which made
   the parser discard this whole block. */
@media (max-width: 768px) {
  section.relative h1 {
    font-size: 1.5rem;
  }
  section.relative p {
    font-size: 1rem;
  }
}
/* Prevent horizontal overflow on small screens */
body {
overflow-x: hidden;
}
</style>
<base target="_blank">
</head>
<body class="font-sans text-primary bg-background leading-relaxed overflow-x-hidden break-words">
<!-- Toggle button for small screens -->
<!-- Visible up to 1024px, the same width at which the stylesheet slides .toc-fixed off-screen; the earlier md:hidden cut-off (768px) left 768–1024px viewports with no control to reopen the sidebar. -->
<!-- NOTE(review): z-50 sits below .toc-fixed (z-index: 1000), so an open sidebar covers this button; confirm the page script offers another way to close it. -->
<button id="toc-toggle" type="button" class="fixed top-4 left-4 z-50 p-2 bg-accent text-white rounded-lg shadow-lg min-[1025px]:hidden" aria-label="Toggle table of contents" aria-controls="toc">
  <i class="fas fa-bars" aria-hidden="true"></i>
</button>
<!-- Fixed Table of Contents -->
<!-- Every in-page link carries target="_self": the document-wide <base target="_blank"> in <head> would otherwise open each table-of-contents jump in a new tab. -->
<nav class="toc-fixed" id="toc" aria-label="Table of contents">
  <div class="mb-8">
    <h2 class="text-lg font-bold text-primary mb-4">Table of Contents</h2>
    <div class="space-y-2 text-sm">
      <a href="#introduction" target="_self" class="toc-link block text-muted hover:text-accent transition-colors">Introduction</a>
      <a href="#framework" target="_self" class="toc-link block text-muted hover:text-accent transition-colors">1. The Self-Evolving Agent Framework</a>
      <div class="ml-4 space-y-1">
        <a href="#core-challenge" target="_self" class="toc-link block text-muted hover:text-accent transition-colors text-xs">1.1 The Core Challenge</a>
        <a href="#self-evolving-loop" target="_self" class="toc-link block text-muted hover:text-accent transition-colors text-xs">1.2 The Self-Evolving Loop</a>
        <a href="#use-case" target="_self" class="toc-link block text-muted hover:text-accent transition-colors text-xs">1.3 Healthcare Use Case</a>
      </div>
      <a href="#manual-optimization" target="_self" class="toc-link block text-muted hover:text-accent transition-colors">2. Manual Prompt Optimization</a>
      <div class="ml-4 space-y-1">
        <a href="#platform-workflow" target="_self" class="toc-link block text-muted hover:text-accent transition-colors text-xs">2.1 Platform Workflow</a>
        <a href="#step-by-step" target="_self" class="toc-link block text-muted hover:text-accent transition-colors text-xs">2.2 Step-by-Step Process</a>
      </div>
      <a href="#automated-healing" target="_self" class="toc-link block text-muted hover:text-accent transition-colors">3. Automated Self-Healing</a>
      <div class="ml-4 space-y-1">
        <a href="#system-architecture" target="_self" class="toc-link block text-muted hover:text-accent transition-colors text-xs">3.1 System Architecture</a>
        <a href="#evaluation-suite" target="_self" class="toc-link block text-muted hover:text-accent transition-colors text-xs">3.2 Evaluation Suite</a>
        <a href="#orchestration" target="_self" class="toc-link block text-muted hover:text-accent transition-colors text-xs">3.3 Orchestration</a>
      </div>
      <a href="#advanced-strategies" target="_self" class="toc-link block text-muted hover:text-accent transition-colors">4. Advanced Strategies</a>
      <div class="ml-4 space-y-1">
        <a href="#model-evaluation" target="_self" class="toc-link block text-muted hover:text-accent transition-colors text-xs">4.1 Model Evaluation</a>
        <a href="#gepa" target="_self" class="toc-link block text-muted hover:text-accent transition-colors text-xs">4.2 GEPA Framework</a>
      </div>
      <a href="#appendix" target="_self" class="toc-link block text-muted hover:text-accent transition-colors">5. Appendix</a>
    </div>
  </div>
</nav>
<!-- Main Content -->
<main class="content-offset">
<!-- Hero Section -->
<section class="hero-gradient relative overflow-hidden">
<div class="absolute inset-0 bg-black/20"></div>
<img src="https://kimi-web-img.moonshot.cn/img/smythos.com/77275013d45c2dcb10d40abd4ce7ffa04db91d9c.jpg" alt="Abstract representation of an AI agent improvement loop" class="absolute inset-0 w-full h-full object-cover opacity-10" size="wallpaper" aspect="wide" query="AI agent self improvement abstract" referrerpolicy="no-referrer" data-modified="1" data-score="0.00"/>
<div class="relative z-10 container mx-auto px-8 py-16 md:py-24">
<div class="grid grid-cols-1 md:grid-cols-12 gap-8 items-center">
<!-- Title and Subtitle -->
<div class="md:col-span-8">
<h1 class="font-serif text-3xl md:text-6xl font-bold text-white mb-6 text-shadow italic">
A Cookbook for Building
<span class="block text-accent">Self-Evolving Agents</span>
</h1>
<p class="text-lg md:text-2xl text-gray-200 mb-8 leading-relaxed">
A Framework for Continuous Improvement in Production
</p>
<div class="flex items-center space-x-4 text-gray-300">
<span class="flex items-center">
<i class="fas fa-robot mr-2 text-accent"></i>
AI Systems
</span>
<span class="flex items-center">
<i class="fas fa-sync-alt mr-2 text-accent"></i>
Continuous Learning
</span>
</div>
</div>
<!-- Key Highlights -->
<div class="md:col-span-4 mt-8 md:mt-0">
<div class="glass-effect rounded-lg p-6 backdrop-blur-sm">
<h3 class="text-white font-semibold mb-4">What You'll Learn</h3>
<ul class="space-y-3 text-sm text-gray-200">
<li class="flex items-start">
<i class="fas fa-check-circle text-accent mr-3 mt-1"></i>
Diagnose why autonomous agents fall short of production readiness
</li>
<li class="flex items-start">
<i class="fas fa-check-circle text-accent mr-3 mt-1"></i>
Compare three prompt-optimization strategies
</li>
<li class="flex items-start">
<i class="fas fa-check-circle text-accent mr-3 mt-1"></i>
Assemble a self-healing workflow with human review and LLM evals
</li>
</ul>
</div>
</div>
</div>
</div>
</section>
<!-- Introduction -->
<section id="introduction" class="py-16 bg-background">
<div class="container mx-auto px-8 max-w-4xl">
<div class="prose prose-lg max-w-none">
<p class="text-xl text-muted leading-relaxed mb-8">
This cookbook provides a practical framework for building self-evolving agents that can learn from their mistakes and improve their performance over time. By combining human feedback, automated evaluation using an "LLM-as-a-judge," and iterative prompt optimization, you can move beyond brittle proof-of-concept demos to create robust, production-ready systems.
</p>
<div class="grid grid-cols-1 md:grid-cols-3 gap-8 my-12">
<div class="bg-surface p-6 rounded-lg border border-border">
<i class="fas fa-microscope text-accent text-2xl mb-4"></i>
<h3 class="font-semibold text-primary mb-2">ML/AI Engineers</h3>
<p class="text-sm text-muted">Move beyond toy demos with executable artifacts for production pipelines</p>
</div>
<div class="bg-surface p-6 rounded-lg border border-border">
<i class="fas fa-users text-accent text-2xl mb-4"></i>
<h3 class="font-semibold text-primary mb-2">Product Teams</h3>
<p class="text-sm text-muted">Adapt internal tooling with accuracy, auditability, and rapid iteration</p>
</div>
<div class="bg-surface p-6 rounded-lg border border-border">
<i class="fas fa-cogs text-accent text-2xl mb-4"></i>
<h3 class="font-semibold text-primary mb-2">Solution Architects</h3>
<p class="text-sm text-muted">Design systems that learn and improve autonomously in production</p>
</div>
</div>
</div>
</div>
</section>
<div class="section-divider"></div>
<!-- Section 1: The Self-Evolving Agent Framework -->
<section id="framework" class="py-16">
<div class="container mx-auto px-8 max-w-4xl">
<h2 class="font-serif text-4xl font-bold text-primary mb-8">1. The Self-Evolving Agent Framework</h2>
<div id="core-challenge" class="mb-16">
<h3 class="font-serif text-2xl font-semibold text-primary mb-6">1.1 The Core Challenge: Overcoming the Post-Proof-of-Concept Plateau</h3>
<p class="text-lg text-muted mb-6">
A significant and recurring challenge in the development of agentic systems is the plateau in performance and reliability that often follows an initial proof-of-concept. While early demonstrations can showcase the potential of Large Language Models (LLMs) to automate complex tasks, these systems frequently fall short of production readiness.
</p>
<div class="bg-amber-50 border-l-4 border-amber-400 p-6 mb-8">
<div class="flex">
<i class="fas fa-exclamation-triangle text-amber-400 mr-3 mt-1"></i>
<div>
<h4 class="font-semibold text-amber-800 mb-2">The Critical Gap</h4>
<p class="text-amber-700">The core issue lies in their inability to autonomously diagnose and correct failures, particularly the edge cases that emerge when exposed to the full complexity and variability of real-world data.</p>
</div>
</div>
</div>
<p class="text-lg text-muted mb-6">
This dependency on human intervention for continuous diagnosis and correction creates a bottleneck, hindering scalability and long-term viability. The <strong>self-evolving loop</strong> addresses this critical gap by introducing a repeatable and structured retraining loop designed to capture failures, learn from feedback, and iteratively promote improvements back into the production workflow.
</p>
</div>
<div id="self-evolving-loop" class="mb-16">
<h3 class="font-serif text-2xl font-semibold text-primary mb-6">1.2 The Self-Evolving Loop: An Iterative Cycle of Feedback and Refinement</h3>
<!-- Self-Evolving Loop Diagram -->
<div class="bg-surface p-8 rounded-lg border border-border mb-8">
<h4 class="font-semibold text-primary mb-4">The Self-Evolving Loop Architecture</h4>
<div class="mermaid-container">
<!-- Zoom/fullscreen toolbar for the diagram below. The buttons are icon-only, so each gets an aria-label; tooltips are in English to match the document language (lang="en"). -->
<div class="mermaid-controls">
  <button type="button" class="mermaid-control-btn zoom-in" title="Zoom in" aria-label="Zoom in">
    <i class="fas fa-search-plus" aria-hidden="true"></i>
  </button>
  <button type="button" class="mermaid-control-btn zoom-out" title="Zoom out" aria-label="Zoom out">
    <i class="fas fa-search-minus" aria-hidden="true"></i>
  </button>
  <button type="button" class="mermaid-control-btn reset-zoom" title="Reset" aria-label="Reset zoom">
    <i class="fas fa-expand-arrows-alt" aria-hidden="true"></i>
  </button>
  <button type="button" class="mermaid-control-btn fullscreen" title="View fullscreen" aria-label="View fullscreen">
    <i class="fas fa-expand" aria-hidden="true"></i>
  </button>
</div>
<div class="mermaid" id="mermaid-1">
graph TD
A["Baseline Agent"] --> B["Generate Output"]
B --> C["Human Feedback"]
B --> D["LLM-as-Judge"]
C --> E["Evals & Aggregated Score"]
D --> E
E --> F{"Score > Threshold?"}
F -->|"No"| G["Prompt Optimization"]
F -->|"Yes"| H["Update Baseline Agent"]
G --> I["Generate New Prompt"]
I --> A
H --> A
style A fill:#fefefe,stroke:#0d9488,stroke-width:3px,color:#1a1a1a
style B fill:#f0f9ff,stroke:#0369a1,stroke-width:2px,color:#1a1a1a
style C fill:#f0fdf4,stroke:#16a34a,stroke-width:2px,color:#1a1a1a
style D fill:#f0fdf4,stroke:#16a34a,stroke-width:2px,color:#1a1a1a
style E fill:#fffbeb,stroke:#d97706,stroke-width:2px,color:#1a1a1a
style F fill:#fef3c7,stroke:#d97706,stroke-width:3px,color:#1a1a1a
style G fill:#fdf2f8,stroke:#be185d,stroke-width:2px,color:#1a1a1a
style H fill:#ecfdf5,stroke:#059669,stroke-width:3px,color:#1a1a1a
style I fill:#f0f9ff,stroke:#0369a1,stroke-width:2px,color:#1a1a1a
</div>
</div>
</div>
<p class="text-lg text-muted mb-6">
The central innovation of this cookbook is the <strong>"self-evolving loop,"</strong> a systematic and iterative process designed to enable continuous, autonomous improvement of an AI agent. This loop is engineered to move agentic systems beyond static, pre-programmed behaviors and into a state of dynamic learning and adaptation.
</p>
<!-- Five Stages -->
<div class="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-6 my-8">
<div class="bg-gradient-to-br from-blue-50 to-blue-100 p-6 rounded-lg border border-blue-200">
<div class="text-center mb-4">
<i class="fas fa-play-circle text-3xl text-blue-600"></i>
</div>
<h4 class="font-semibold text-blue-900 mb-2">1. Baseline Agent</h4>
<p class="text-sm text-blue-700">Establish the initial benchmark with a deliberately simple agent</p>
</div>
<div class="bg-gradient-to-br from-green-50 to-green-100 p-6 rounded-lg border border-green-200">
<div class="text-center mb-4">
<i class="fas fa-comments text-3xl text-green-600"></i>
</div>
<h4 class="font-semibold text-green-900 mb-2">2. Feedback Collection</h4>
<p class="text-sm text-green-700">Gather structured feedback from humans and LLM-as-a-judge</p>
</div>
<div class="bg-gradient-to-br from-yellow-50 to-yellow-100 p-6 rounded-lg border border-yellow-200">
<div class="text-center mb-4">
<i class="fas fa-chart-line text-3xl text-yellow-600"></i>
</div>
<h4 class="font-semibold text-yellow-900 mb-2">3. Evaluation & Scoring</h4>
<p class="text-sm text-yellow-700">Measure performance using specialized graders</p>
</div>
<div class="bg-gradient-to-br from-purple-50 to-purple-100 p-6 rounded-lg border border-purple-200">
<div class="text-center mb-4">
<i class="fas fa-magic text-3xl text-purple-600"></i>
</div>
<h4 class="font-semibold text-purple-900 mb-2">4. Prompt Optimization</h4>
<p class="text-sm text-purple-700">Generate improved instructions based on feedback</p>
</div>
<div class="bg-gradient-to-br from-teal-50 to-teal-100 p-6 rounded-lg border border-teal-200">
<div class="text-center mb-4">
<i class="fas fa-arrow-up text-3xl text-teal-600"></i>
</div>
<h4 class="font-semibold text-teal-900 mb-2">5. Updated Agent</h4>
<p class="text-sm text-teal-700">Promote the best-performing version to production</p>
</div>
</div>
</div>
<div id="use-case" class="mb-16">
<h3 class="font-serif text-2xl font-semibold text-primary mb-6">1.3 Use Case: Healthcare Regulatory Documentation</h3>
<div class="bg-surface p-8 rounded-lg border border-border mb-8">
<!-- Below-the-fold illustration: lazy-loaded (its box is already fixed by w-full/h-48). The former style="photo" was not a CSS declaration, so it is kept as data-style instead. -->
<img src="https://kimi-web-img.moonshot.cn/img/pic.616pic.com/9aa9cd44d240b614c68607c2b53e4a406070db0d.jpg" alt="Pharmaceutical regulatory documents on a desk" class="w-full h-48 object-cover rounded-lg mb-6" loading="lazy" decoding="async" size="medium" aspect="wide" data-style="photo" query="pharmaceutical regulatory documents" referrerpolicy="no-referrer" data-modified="1" data-score="0.00"/>
<p class="text-lg text-muted mb-6">
To ground the abstract concepts in a concrete, real-world scenario, this cookbook focuses on a challenging and high-stakes use case: the drafting of regulatory documents for the pharmaceutical industry. This domain demands an exceptionally high degree of accuracy, precision, and compliance.
</p>
<div class="grid grid-cols-1 md:grid-cols-2 gap-6">
<div>
<h4 class="font-semibold text-primary mb-3">Baseline Agent Architecture</h4>
<ul class="space-y-2 text-muted">
<li class="flex items-start">
<i class="fas fa-file-alt text-accent mr-2 mt-1"></i>
<span><strong>Summarizer:</strong> Creates scientific and concise summaries</span>
</li>
<li class="flex items-start">
<i class="fas fa-shield-alt text-accent mr-2 mt-1"></i>
<span><strong>Compliance Checker:</strong> Evaluates against FDA 21 CFR Part 11</span>
</li>
</ul>
</div>
<div>
<h4 class="font-semibold text-primary mb-3">Dataset</h4>
<ul class="space-y-2 text-muted">
<li class="flex items-start">
<i class="fas fa-database text-accent mr-2 mt-1"></i>
<span><strong>Source:</strong> Sample CMC Section for Hyperpolarized Pyruvate (13C) Injection</span>
</li>
<li class="flex items-start">
<i class="fas fa-list-ol text-accent mr-2 mt-1"></i>
<span><strong>Size:</strong> ~70 sections of technical documentation</span>
</li>
</ul>
</div>
</div>
</div>
</div>
</div>
</section>
<div class="section-divider"></div>
<!-- Section 2: Manual Prompt Optimization -->
<section id="manual-optimization" class="py-16 bg-surface">
<div class="container mx-auto px-8 max-w-4xl">
<h2 class="font-serif text-4xl font-bold text-primary mb-8">2. Manual Prompt Optimization with OpenAI Evals</h2>
<div id="platform-workflow" class="mb-16">
<h3 class="font-serif text-2xl font-semibold text-primary mb-6">2.1 Workflow Overview</h3>
<p class="text-lg text-muted mb-8">
The OpenAI Evals platform provides a powerful and intuitive web-based interface for the manual optimization and evaluation of prompts. This approach is particularly well-suited for rapid prototyping and close collaboration with subject matter experts.
</p>
<!-- Platform Interface Workflow -->
<div class="bg-white p-8 rounded-lg border border-border mb-8">
<!-- Below-the-fold illustration: lazy-loaded (its box is already fixed by w-full/h-64). The former style="photo" was not a CSS declaration, so it is kept as data-style instead. -->
<img src="https://kimi-web-img.moonshot.cn/img/s3.amazonaws.com/c2bd42c94bb1f04f205fda3e5c5d4bc7de69c335.png" alt="OpenAI Evals platform user interface" class="w-full h-64 object-cover rounded-lg mb-6" loading="lazy" decoding="async" size="medium" aspect="wide" data-style="photo" query="OpenAI Evals platform interface" referrerpolicy="no-referrer" data-modified="1" data-score="0.00"/>
<div class="grid grid-cols-1 md:grid-cols-2 gap-8">
<div>
<h4 class="font-semibold text-primary mb-4">Key Features</h4>
<ul class="space-y-3 text-muted">
<li class="flex items-start">
<i class="fas fa-upload text-accent mr-3 mt-1"></i>
<span>Dataset upload and exploration</span>
</li>
<li class="flex items-start">
<i class="fas fa-cog text-accent mr-3 mt-1"></i>
<span>Prompt configuration with variables</span>
</li>
<li class="flex items-start">
<i class="fas fa-play text-accent mr-3 mt-1"></i>
<span>Batch output generation</span>
</li>
</ul>
</div>
<div>
<h4 class="font-semibold text-primary mb-4">Optimization Tools</h4>
<ul class="space-y-3 text-muted">
<li class="flex items-start">
<i class="fas fa-star text-accent mr-3 mt-1"></i>
<span>Structured feedback collection</span>
</li>
<li class="flex items-start">
<i class="fas fa-magic text-accent mr-3 mt-1"></i>
<span>Automated prompt optimization</span>
</li>
<li class="flex items-start">
<i class="fas fa-chart-bar text-accent mr-3 mt-1"></i>
<span>Performance comparison across versions</span>
</li>
</ul>
</div>
</div>
</div>
</div>
<div id="step-by-step" class="mb-16">
<h3 class="font-serif text-2xl font-semibold text-primary mb-6">2.2 Step-by-Step Process</h3>
<!-- Process Table -->
<div class="overflow-x-auto mb-8">
<table class="w-full bg-white rounded-lg border border-border">
<!-- Column headers; scope="col" ties each header to the cells beneath it for assistive technology. -->
<thead class="bg-surface">
  <tr>
    <th scope="col" class="px-6 py-4 text-left font-semibold text-primary">Step</th>
    <th scope="col" class="px-6 py-4 text-left font-semibold text-primary">Action</th>
    <th scope="col" class="px-6 py-4 text-left font-semibold text-primary">Description</th>
  </tr>
</thead>
<tbody class="divide-y divide-border">
<tr>
<td class="px-6 py-4 font-semibold text-accent">1</td>
<td class="px-6 py-4 font-medium">Upload Dataset</td>
<td class="px-6 py-4 text-muted">Upload CSV containing inputs for the agent</td>
</tr>
<tr>
<td class="px-6 py-4 font-semibold text-accent">2</td>
<td class="px-6 py-4 font-medium">Explore Data</td>
<td class="px-6 py-4 text-muted">Verify data is properly formatted and complete</td>
</tr>
<tr>
<td class="px-6 py-4 font-semibold text-accent">3</td>
<td class="px-6 py-4 font-medium">Configure Prompt</td>
<td class="px-6 py-4 text-muted">Define system prompt, user template, and model settings</td>
</tr>
<tr>
<td class="px-6 py-4 font-semibold text-accent">4</td>
<td class="px-6 py-4 font-medium">Generate Outputs</td>
<td class="px-6 py-4 text-muted">Run prompt against dataset to create baseline</td>
</tr>
<tr>
<td class="px-6 py-4 font-semibold text-accent">5</td>
<td class="px-6 py-4 font-medium">Review & Evaluate</td>
<td class="px-6 py-4 text-muted">Provide structured feedback with ratings and comments</td>
</tr>
<tr>
<td class="px-6 py-4 font-semibold text-accent">6</td>
<td class="px-6 py-4 font-medium">Optimize Prompt</td>
<td class="px-6 py-4 text-muted">Use automated optimization based on feedback</td>
</tr>
<tr>
<td class="px-6 py-4 font-semibold text-accent">7</td>
<td class="px-6 py-4 font-medium">Iterate & Compare</td>
<td class="px-6 py-4 text-muted">Repeat cycle until performance is satisfactory</td>
</tr>
</tbody>
</table>
</div>
<div class="bg-blue-50 border-l-4 border-blue-400 p-6">
<div class="flex">
<i class="fas fa-lightbulb text-blue-400 mr-3 mt-1"></i>
<div>
<h4 class="font-semibold text-blue-800 mb-2">Pro Tip</h4>
<p class="text-blue-700">Start with a very simple prompt like "summarize" to clearly demonstrate the power of the optimization process. The platform's ability to evolve from minimal starting points is remarkable.</p>
</div>
</div>
</div>
</div>
</div>
</section>
<div class="section-divider"></div>
<!-- Section 3: Automated Self-Healing -->
<section id="automated-healing" class="py-16">
<div class="container mx-auto px-8 max-w-4xl">
<h2 class="font-serif text-4xl font-bold text-primary mb-8">3. Automated Self-Healing Loop</h2>
<div id="system-architecture" class="mb-16">
<h3 class="font-serif text-2xl font-semibold text-primary mb-6">3.1 System Architecture</h3>
<p class="text-lg text-muted mb-8">
This section introduces a fully automated, programmatic approach to the self-evolving loop, eliminating the need for any user interface. This API-driven workflow is designed for scalability and is well-suited for integration into production pipelines and CI/CD environments.
</p>
<!-- System Components -->
<div class="grid grid-cols-1 md:grid-cols-2 gap-8 mb-8">
<div class="bg-white p-6 rounded-lg border border-border">
<div class="flex items-center mb-4">
<i class="fas fa-robot text-2xl text-accent mr-3"></i>
<h4 class="font-semibold text-primary">Summarization Agent</h4>
</div>
<p class="text-muted text-sm">Primary agent performing the document summarization task</p>
</div>
<div class="bg-white p-6 rounded-lg border border-border">
<div class="flex items-center mb-4">
<i class="fas fa-magic text-2xl text-accent mr-3"></i>
<h4 class="font-semibold text-primary">Metaprompt Agent</h4>
</div>
<p class="text-muted text-sm">Separate agent responsible for prompt optimization</p>
</div>
<div class="bg-white p-6 rounded-lg border border-border">
<div class="flex items-center mb-4">
<i class="fas fa-chart-bar text-2xl text-accent mr-3"></i>
<h4 class="font-semibold text-primary">Evaluation Suite</h4>
</div>
<p class="text-muted text-sm">Collection of specialized graders for quality assessment</p>
</div>
<div class="bg-white p-6 rounded-lg border border-border">
<div class="flex items-center mb-4">
<i class="fas fa-cogs text-2xl text-accent mr-3"></i>
<h4 class="font-semibold text-primary">Orchestration Logic</h4>
</div>
<p class="text-muted text-sm">Python functions managing the feedback loop workflow</p>
</div>
</div>
</div>
<div id="evaluation-suite" class="mb-16">
<h3 class="font-serif text-2xl font-semibold text-primary mb-6">3.2 Building the Evaluation Suite</h3>
<!-- Graders Table -->
<div class="overflow-x-auto mb-8">
<table class="w-full bg-white rounded-lg border border-border">
<!-- Column headers; scope="col" ties each header to the cells beneath it for assistive technology. -->
<thead class="bg-surface">
  <tr>
    <th scope="col" class="px-6 py-4 text-left font-semibold text-primary">Grader</th>
    <th scope="col" class="px-6 py-4 text-left font-semibold text-primary">Type</th>
    <th scope="col" class="px-6 py-4 text-left font-semibold text-primary">Pass Threshold</th>
    <th scope="col" class="px-6 py-4 text-left font-semibold text-primary">What It Checks</th>
  </tr>
</thead>
<tbody class="divide-y divide-border">
<tr>
<td class="px-6 py-4 font-medium text-blue-700">Chemical Name Preservation</td>
<td class="px-6 py-4 text-muted">Python</td>
<td class="px-6 py-4 font-semibold text-blue-600">0.8</td>
<td class="px-6 py-4 text-muted">Ensures all chemical names appear in summary</td>
</tr>
<tr>
<td class="px-6 py-4 font-medium text-green-700">Summary Length Adherence</td>
<td class="px-6 py-4 text-muted">Python</td>
<td class="px-6 py-4 font-semibold text-green-600">0.85</td>
<td class="px-6 py-4 text-muted">Measures deviation from 100-word target</td>
</tr>
<tr>
<td class="px-6 py-4 font-medium text-yellow-700">Semantic Similarity</td>
<td class="px-6 py-4 text-muted">Cosine Similarity</td>
<td class="px-6 py-4 font-semibold text-yellow-600">0.85</td>
<td class="px-6 py-4 text-muted">Calculates semantic overlap with source</td>
</tr>
<tr>
<td class="px-6 py-4 font-medium text-purple-700">Holistic Quality Assessment</td>
<td class="px-6 py-4 text-muted">LLM-as-a-Judge</td>
<td class="px-6 py-4 font-semibold text-purple-600">0.85</td>
<td class="px-6 py-4 text-muted">Rubric-driven score from evaluator model</td>
</tr>
</tbody>
</table>
</div>
<!-- Evaluation Process Flow -->
<div class="bg-surface p-8 rounded-lg border border-border mb-8">
<h4 class="font-semibold text-primary mb-6">Evaluation Process Flow</h4>
<div class="mermaid-container">
<!-- Zoom/fullscreen toolbar for the diagram below. The buttons are icon-only, so each gets an aria-label; tooltips are in English to match the document language (lang="en"). -->
<div class="mermaid-controls">
  <button type="button" class="mermaid-control-btn zoom-in" title="Zoom in" aria-label="Zoom in">
    <i class="fas fa-search-plus" aria-hidden="true"></i>
  </button>
  <button type="button" class="mermaid-control-btn zoom-out" title="Zoom out" aria-label="Zoom out">
    <i class="fas fa-search-minus" aria-hidden="true"></i>
  </button>
  <button type="button" class="mermaid-control-btn reset-zoom" title="Reset" aria-label="Reset zoom">
    <i class="fas fa-expand-arrows-alt" aria-hidden="true"></i>
  </button>
  <button type="button" class="mermaid-control-btn fullscreen" title="View fullscreen" aria-label="View fullscreen">
    <i class="fas fa-expand" aria-hidden="true"></i>
  </button>
</div>
<div class="mermaid" id="mermaid-2">
graph LR
A["Agent Output"] --> B["Chemical Grader"]
A --> C["Length Grader"]
A --> D["Similarity Grader"]
A --> E["LLM Judge"]
B --> F["Chemical Score: 0.8"]
C --> G["Length Score: 0.85"]
D --> H["Similarity Score: 0.9"]
E --> I["Quality Score: 0.85"]
F --> J["Aggregate Score: 0.85"]
G --> J
H --> J
I --> J
style A fill:#fefefe,stroke:#0d9488,stroke-width:3px,color:#1a1a1a
style J fill:#f0f9ff,stroke:#0369a1,stroke-width:3px,color:#1a1a1a
style B fill:#f0fdf4,stroke:#16a34a,stroke-width:2px,color:#1a1a1a
style C fill:#f0fdf4,stroke:#16a34a,stroke-width:2px,color:#1a1a1a
style D fill:#f0fdf4,stroke:#16a34a,stroke-width:2px,color:#1a1a1a
style E fill:#f0fdf4,stroke:#16a34a,stroke-width:2px,color:#1a1a1a
style F fill:#ecfdf5,stroke:#059669,stroke-width:2px,color:#1a1a1a
style G fill:#ecfdf5,stroke:#059669,stroke-width:2px,color:#1a1a1a
style H fill:#ecfdf5,stroke:#059669,stroke-width:2px,color:#1a1a1a
style I fill:#ecfdf5,stroke:#059669,stroke-width:2px,color:#1a1a1a
</div>
</div>
</div>
</div>
<div id="orchestration" class="mb-16">
<h3 class="font-serif text-2xl font-semibold text-primary mb-6">3.3 Orchestration and Monitoring</h3>
<p class="text-lg text-muted mb-8">
The orchestration logic brings together all components and coordinates their actions to create a seamless, automated workflow. This includes agent versioning, feedback translation, and promotion decisions.
</p>
<!-- Monitoring Dashboard -->
<div class="bg-surface p-8 rounded-lg border border-border">
<img src="https://kimi-web-img.moonshot.cn/img/images.klipfolio.com/6ca28218882fe0b3b416a84b7aafc850f4f33bd7.png" alt="Monitoring dashboard with metrics and graphs" class="w-full h-48 object-cover rounded-lg mb-6" loading="lazy" decoding="async" data-size="medium" data-aspect="wide" data-style="photo" data-query="software monitoring dashboard" referrerpolicy="no-referrer" data-modified="1" data-score="0.00"/>
<div class="grid grid-cols-1 md:grid-cols-2 gap-8">
<div>
<h4 class="font-semibold text-primary mb-4">Observability Features</h4>
<ul class="space-y-3 text-muted">
<li class="flex items-start">
<i class="fas fa-chart-line text-accent mr-3 mt-1"></i>
<span><strong>Dashboard Tracing:</strong> Real-time workflow visualization</span>
</li>
<li class="flex items-start">
<i class="fas fa-history text-accent mr-3 mt-1"></i>
<span><strong>Version History:</strong> Complete prompt evolution tracking</span>
</li>
<li class="flex items-start">
<i class="fas fa-clock text-accent mr-3 mt-1"></i>
<span><strong>Performance Metrics:</strong> Latency and throughput monitoring</span>
</li>
</ul>
</div>
<div>
<h4 class="font-semibold text-primary mb-4">Production Monitoring</h4>
<ul class="space-y-3 text-muted">
<li class="flex items-start">
<i class="fas fa-sync-alt text-accent mr-3 mt-1"></i>
<span><strong>Continuous Monitoring:</strong> Scheduled re-evaluation</span>
</li>
<li class="flex items-start">
<i class="fas fa-exclamation-triangle text-accent mr-3 mt-1"></i>
<span><strong>Drift Detection:</strong> Performance degradation alerts</span>
</li>
<li class="flex items-start">
<i class="fas fa-redo text-accent mr-3 mt-1"></i>
<span><strong>Auto-Recovery:</strong> Automatic rollback to stable versions</span>
</li>
</ul>
</div>
</div>
</div>
</div>
</div>
</section>
<div class="section-divider"></div>
<!-- Section 4: Advanced Strategies -->
<section id="advanced-strategies" class="py-16 bg-surface">
<div class="container mx-auto px-8 max-w-4xl">
<h2 class="font-serif text-4xl font-bold text-primary mb-8">4. Advanced Optimization Strategies</h2>
<div id="model-evaluation" class="mb-16">
<h3 class="font-serif text-2xl font-semibold text-primary mb-6">4.1 Model Evaluation and Selection</h3>
<p class="text-lg text-muted mb-8">
The self-evolving loop can be extended beyond prompt optimization to include the evaluation and selection of different model candidates, automatically finding the optimal balance between performance and cost.
</p>
<!-- Model Comparison -->
<div class="bg-white p-8 rounded-lg border border-border mb-8">
<h4 class="font-semibold text-primary mb-6">Model Comparison Workflow</h4>
<div class="mermaid-container">
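<div class="mermaid-controls">
<button type="button" class="mermaid-control-btn zoom-in" title="Zoom in" aria-label="Zoom in">
<i class="fas fa-search-plus" aria-hidden="true"></i>
</button>
<button type="button" class="mermaid-control-btn zoom-out" title="Zoom out" aria-label="Zoom out">
<i class="fas fa-search-minus" aria-hidden="true"></i>
</button>
<button type="button" class="mermaid-control-btn reset-zoom" title="Reset zoom" aria-label="Reset zoom">
<i class="fas fa-expand-arrows-alt" aria-hidden="true"></i>
</button>
<button type="button" class="mermaid-control-btn fullscreen" title="View fullscreen" aria-label="View fullscreen">
<i class="fas fa-expand" aria-hidden="true"></i>
</button>
</div>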
<div class="mermaid" id="mermaid-3">
graph TD
%% Model comparison flow, drawn top-down.
%% Fan-out: the same improved prompt is evaluated with three model candidates.
A["Improved Prompt"] --> B["Evaluate with GPT-5"]
A --> C["Evaluate with GPT-5-mini"]
A --> D["Evaluate with GPT-5-nano"]
%% Each evaluation leads to its own score node.
B --> E["Score: 0.92"]
C --> F["Score: 0.88"]
D --> G["Score: 0.85"]
%% Fan-in: all scores feed the selection decision (diamond node).
E --> H{"Select Best Model"}
F --> H
G --> H
%% The decision links to the chosen model and to two annotation nodes
%% (cost and performance).
H --> I["GPT-5 Selected"]
H --> J["Cost Analysis: $0.12/query"]
H --> K["Performance: +8% improvement"]
%% Styling: score nodes are tinted green (E), amber (F) and red (G); the
%% prompt (A), decision (H) and selected model (I) use a 3px stroke.
style A fill:#fefefe,stroke:#0d9488,stroke-width:3px,color:#1a1a1a
style I fill:#ecfdf5,stroke:#059669,stroke-width:3px,color:#1a1a1a
style B fill:#f0f9ff,stroke:#0369a1,stroke-width:2px,color:#1a1a1a
style C fill:#f0f9ff,stroke:#0369a1,stroke-width:2px,color:#1a1a1a
style D fill:#f0f9ff,stroke:#0369a1,stroke-width:2px,color:#1a1a1a
style E fill:#ecfdf5,stroke:#16a34a,stroke-width:2px,color:#1a1a1a
style F fill:#fef3c7,stroke:#d97706,stroke-width:2px,color:#1a1a1a
style G fill:#fee2e2,stroke:#dc2626,stroke-width:2px,color:#1a1a1a
style H fill:#f0f9ff,stroke:#0369a1,stroke-width:3px,color:#1a1a1a
style J fill:#f0f9ff,stroke:#0369a1,stroke-width:2px,color:#1a1a1a
style K fill:#f0f9ff,stroke:#0369a1,stroke-width:2px,color:#1a1a1a
</div>
</div>
</div>
</div>
<div id="gepa" class="mb-16">
<h3 class="font-serif text-2xl font-semibold text-primary mb-6">4.2 Prompt Optimization with Genetic-Pareto (GEPA)</h3>
<p class="text-lg text-muted mb-8">
The Genetic-Pareto (GEPA) framework represents a more advanced approach to prompt optimization, employing an evolutionary process with reflective, language-based updates to find robust, generalized prompts.
</p>
<div class="bg-white p-8 rounded-lg border border-border mb-8">
<img src="https://kimi-web-img.moonshot.cn/img/media.springernature.com/913e169745aaae72d604de7f643cfdb8e7663be4.png" alt="Abstract representation of evolutionary algorithm concept" class="w-full h-40 object-cover rounded-lg mb-6" loading="lazy" decoding="async" data-size="medium" data-aspect="wide" data-query="evolutionary algorithm abstract" referrerpolicy="no-referrer" data-modified="1" data-score="0.00"/>
<div class="mb-6">
<h4 class="font-semibold text-primary mb-4">GEPA Framework Benefits</h4>
<div class="grid grid-cols-1 md:grid-cols-2 gap-6">
<div class="space-y-3">
<div class="flex items-start">
<i class="fas fa-brain text-accent mr-3 mt-1"></i>
<span class="text-muted"><strong>Reflective Evolution:</strong> Analyzes performance and proposes intelligent improvements</span>
</div>
<div class="flex items-start">
<i class="fas fa-shield-alt text-accent mr-3 mt-1"></i>
<span class="text-muted"><strong>Generalization:</strong> Uses training/validation sets to prevent overfitting</span>
</div>
</div>
<div class="space-y-3">
<div class="flex items-start">
<i class="fas fa-dna text-accent mr-3 mt-1"></i>
<span class="text-muted"><strong>Evolutionary Approach:</strong> Samples trajectories and reflects on feedback</span>
</div>
<div class="flex items-start">
<i class="fas fa-certificate text-accent mr-3 mt-1"></i>
<span class="text-muted"><strong>Empirical Evidence:</strong> Clear performance validation across datasets</span>
</div>
</div>
</div>
</div>
<div class="bg-gray-50 p-6 rounded-lg">
<p class="text-sm text-muted">
<strong>Citation:</strong>
<a href="https://arxiv.org/abs/2507.19457" class="citation-link" target="_blank" rel="noopener noreferrer">
GEPA: Reflective Prompt Evolution Can Outperform Reinforcement Learning
</a> by Agrawal et al.
</p>
</div>
</div>
</div>
</div>
</section>
<div class="section-divider"></div>
<!-- Section 5: Appendix -->
<section id="appendix" class="py-16">
<div class="container mx-auto px-8 max-w-4xl">
<h2 class="font-serif text-4xl font-bold text-primary mb-8">5. Appendix</h2>
<div class="mb-16">
<h3 class="font-serif text-2xl font-semibold text-primary mb-6">5.1 Example Prompts from Each Optimization Method</h3>
<!-- Prompt Examples -->
<div class="space-y-8">
<div class="bg-white border border-border rounded-lg overflow-hidden">
<div class="bg-gray-50 px-6 py-4 border-b border-border">
<h4 class="font-semibold text-primary">Initial Baseline Prompt</h4>
</div>
<div class="p-6">
<pre class="text-sm text-muted bg-gray-50 p-4 rounded overflow-x-auto">You are a summarization assistant.
Given a section of text, produce a summary.</pre>
</div>
</div>
<div class="bg-white border border-border rounded-lg overflow-hidden">
<div class="bg-blue-50 px-6 py-4 border-b border-border">
<h4 class="font-semibold text-primary">OpenAI Platform Optimizer Output</h4>
</div>
<div class="p-6">
<pre class="text-sm text-muted bg-blue-50 p-4 rounded overflow-x-auto">You are a summarization assistant.
Task: Summarize the provided text concisely and accurately.
Output requirements:
- Output only the summary. Do not add titles, labels (e.g., "Summary:"), prefaces, or commentary.
- Preserve the document's structure. If multiple sections/subsections appear, summarize each one.
- Use a numbered list for sections/subsections (use their numbers/titles when present).
- Under each, use short dash bullets for key points.
- If there is only a single short section, return a brief bullet list or 1-2 concise sentences.
- Split any inline lists into separate bullets.
- Use plain, simple language. Keep bullets tight (ideally one line each). Remove redundancy.
- Include important quantitative details (values, units, conditions) and constraints. Do not invent information.
- Keep formatting simple: plain text, "1." numbering and "-" bullets only. No tables or special markup.
- Retain exact technical terms/notation from the source (e.g., chemical names, isotopic labels).
- If a section is explicitly marked "Not applicable," include that status; otherwise do not add it.</pre>
</div>
</div>
<div class="bg-white border border-border rounded-lg overflow-hidden">
<div class="bg-green-50 px-6 py-4 border-b border-border">
<h4 class="font-semibold text-primary">Static Metaprompt Output</h4>
</div>
<div class="p-6">
<pre class="text-sm text-muted bg-green-50 p-4 rounded overflow-x-auto">You are a technical summarization assistant for scientific and regulatory documentation. Your task is to generate a concise, comprehensive, and fully detailed summary of any scientific, technical, or regulatory text provided. Strictly adhere to the following instructions:
---
**1. Complete and Exact Information Inclusion**
- Capture *every* explicit fact, technical value, specification, quantity, measurement, regulatory reference, entity, process, site, and contextual detail verbatim from the source text.
- Do not omit or generalize any explicit information, no matter how minor.
**2. Precise Terminology and Named Entity Retention**
- Reproduce all names of chemicals, drugs, mixtures, buffer components, devices, companies, institutions, regulatory standards, section numbers, and procedural labels *exactly as stated*.
- Report all quantities, measurements, concentrations, ratios, masses, volumes, compositions, pH values, and units precisely as given.
- Do not paraphrase, rename, substitute, or simplify any term or value.
... [additional detailed instructions] ...</pre>
</div>
</div>
<div class="bg-white border border-border rounded-lg overflow-hidden">
<div class="bg-purple-50 px-6 py-4 border-b border-border">
<h4 class="font-semibold text-primary">GEPA Optimizer Output</h4>
</div>
<div class="p-6">
<pre class="text-sm text-muted bg-purple-50 p-4 rounded overflow-x-auto">You are a domain-aware summarization assistant for technical pharmaceutical texts. Given a "section" of text, produce a concise, single-paragraph summary that preserves key technical facts and exact nomenclature.
Length and format
- Write 1–3 sentences totaling about 45–70 words (target ~60; never exceed 90).
- Use one paragraph; no bullets, headings, tables, or heavy formatting.
Exact names and notation
- Include every chemical name that appears in the section at least once, using the exact original spelling, capitalization, punctuation, isotopic labels, brackets, hyphens, salts, buffer names, and parenthetical qualifiers...
... [highly detailed domain-specific instructions] ...
Self-check before finalizing
- Does the paragraph contain every distinct chemical name exactly as written in the section?
- Is the summary 45–70 words (≤90), in a single paragraph?
- Are the most critical process/regulatory/testing details preserved?</pre>
</div>
</div>
</div>
</div>
</div>
</section>
<!-- Footer -->
<footer class="bg-primary text-white py-12">
<div class="container mx-auto px-8 max-w-4xl">
<div class="grid grid-cols-1 md:grid-cols-3 gap-8">
<div>
<h3 class="font-serif text-xl font-semibold mb-4">Contributors</h3>
<ul class="space-y-2 text-gray-300 text-sm">
<li>Calvin Maguranis</li>
<li>Fanny Perraudeau</li>
<li>Giorgio Saladino</li>
<li>Shikhar Kwatra</li>
<li>Valentina Frenkel</li>
</ul>
</div>
<div>
<h3 class="font-serif text-xl font-semibold mb-4">Citations</h3>
<p class="text-gray-300 text-sm">
<a href="https://arxiv.org/abs/2507.19457" class="citation-link text-accent" target="_blank" rel="noopener noreferrer">
GEPA: Reflective Prompt Evolution Can Outperform Reinforcement Learning
</a>
</p>
</div>
<div>
<h3 class="font-serif text-xl font-semibold mb-4">Resources</h3>
<ul class="space-y-2 text-gray-300 text-sm">
<li>
<a href="#" class="citation-link text-accent">OpenAI Evals Platform</a>
</li>
<li>
<a href="#" class="citation-link text-accent">Agents SDK Documentation</a>
</li>
<li>
<a href="#" class="citation-link text-accent">Sample Dataset</a>
</li>
</ul>
</div>
</div>
<div class="border-t border-gray-700 mt-8 pt-8 text-center text-gray-400 text-sm">
<p>A joint collaboration between Bain and OpenAI</p>
</div>
</div>
</footer>
</main>
<script>
// Configure Mermaid once for every diagram on the page. The option groups are
// built as block-scoped constants so no extra globals are introduced.
{
  // Colour palette for the "base" theme selected below.
  const themeVariables = {
    primaryColor: '#fefefe',
    primaryTextColor: '#1a1a1a',
    primaryBorderColor: '#0d9488',
    lineColor: '#64748b',
    secondaryColor: '#f1f5f9',
    tertiaryColor: '#fef3c7',
    background: '#fefefe',
    mainBkg: '#fefefe',
    secondBkg: '#f0f9ff',
    tertiaryBkg: '#f1f5f9',
    nodeBorder: '#0d9488',
    clusterBkg: '#f8fafc',
    edgeLabelBackground: '#ffffff',
    nodeTextColor: '#1a1a1a'
  };

  // Layout options for flowcharts (the `graph` diagrams on this page).
  const flowchart = {
    useMaxWidth: true,
    htmlLabels: true,
    curve: 'basis',
    padding: 30,
    nodeSpacing: 50,
    rankSpacing: 80,
    diagramPadding: 20
  };

  // Options for timeline diagrams.
  const timeline = {
    useMaxWidth: true,
    padding: 30,
    axisFormat: '%Y-%m-%d'
  };

  // Options for gantt diagrams.
  const gantt = {
    useMaxWidth: true,
    padding: 30
  };

  mermaid.initialize({
    startOnLoad: true,
    theme: 'base',
    themeVariables,
    flowchart,
    timeline,
    gantt,
    fontFamily: 'Inter, sans-serif',
    fontSize: 14,
    // NOTE(review): per the Mermaid docs 'loose' permits HTML in labels and
    // click interactions; confirm whether a stricter level would suffice.
    securityLevel: 'loose'
  });
}
// Wire up zoom, pan, pinch-zoom and fullscreen controls for Mermaid diagrams.
//
// For every `.mermaid-container` on the page this tracks a zoom factor and a
// pan offset and applies them to the container's `.mermaid` element as a CSS
// transform. The `.zoom-in`, `.zoom-out`, `.reset-zoom` and `.fullscreen`
// buttons are optional. A container without a `.mermaid` element is skipped
// so that one malformed container cannot abort the setup of the others.
// Takes no arguments and returns nothing.
function initializeMermaidControls() {
  // Zoom limits shared by the buttons, the wheel and the pinch gesture.
  const MIN_SCALE = 0.3;
  const MAX_SCALE = 4;
  // Factor applied per click on the zoom-in / zoom-out buttons.
  const BUTTON_ZOOM_FACTOR = 1.25;
  // How long mouse events stay ignored after the last finger is lifted.
  const TOUCH_RELEASE_DELAY_MS = 100;

  const clampScale = (value) => Math.min(Math.max(value, MIN_SCALE), MAX_SCALE);

  const containers = document.querySelectorAll('.mermaid-container');
  containers.forEach(container => {
    const mermaidElement = container.querySelector('.mermaid');
    if (!mermaidElement) {
      // Nothing to transform in this container; leave it alone.
      return;
    }

    let scale = 1;
    let isDragging = false;
    let startX, startY, translateX = 0, translateY = 0;
    // Touch-related state
    let isTouch = false;
    let initialDistance = 0;
    let initialScale = 1;
    let isPinching = false;

    // Zoom controls (each one may be absent)
    const zoomInBtn = container.querySelector('.zoom-in');
    const zoomOutBtn = container.querySelector('.zoom-out');
    const resetBtn = container.querySelector('.reset-zoom');
    const fullscreenBtn = container.querySelector('.fullscreen');

    // Push the current pan/zoom state to the DOM.
    function updateTransform() {
      mermaidElement.style.transform = `translate(${translateX}px, ${translateY}px) scale(${scale})`;
      if (scale > 1) {
        container.classList.add('zoomed');
      } else {
        container.classList.remove('zoomed');
      }
      mermaidElement.style.cursor = isDragging ? 'grabbing' : 'grab';
    }

    // Drop any pan offset once the diagram is at or below its natural size.
    function recenterIfNotZoomed() {
      if (scale <= 1) {
        translateX = 0;
        translateY = 0;
      }
    }

    if (zoomInBtn) {
      zoomInBtn.addEventListener('click', () => {
        scale = clampScale(scale * BUTTON_ZOOM_FACTOR);
        updateTransform();
      });
    }

    if (zoomOutBtn) {
      zoomOutBtn.addEventListener('click', () => {
        scale = clampScale(scale / BUTTON_ZOOM_FACTOR);
        recenterIfNotZoomed();
        updateTransform();
      });
    }

    if (resetBtn) {
      resetBtn.addEventListener('click', () => {
        scale = 1;
        translateX = 0;
        translateY = 0;
        updateTransform();
      });
    }

    if (fullscreenBtn) {
      fullscreenBtn.addEventListener('click', () => {
        const fullscreenElement = document.fullscreenElement ||
          document.webkitFullscreenElement ||
          document.msFullscreenElement;

        // Second click while this container is fullscreen: leave fullscreen.
        if (fullscreenElement === container) {
          if (document.exitFullscreen) {
            document.exitFullscreen();
          } else if (document.webkitExitFullscreen) {
            document.webkitExitFullscreen();
          } else if (document.msExitFullscreen) {
            document.msExitFullscreen();
          }
          return;
        }

        if (container.requestFullscreen) {
          container.requestFullscreen();
        } else if (container.webkitRequestFullscreen) {
          container.webkitRequestFullscreen();
        } else if (container.msRequestFullscreen) {
          container.msRequestFullscreen();
        }
      });
    }

    // Mouse events: press on the diagram to start a drag; movement and release
    // are tracked on the document so the drag survives leaving the element.
    mermaidElement.addEventListener('mousedown', (e) => {
      if (isTouch) return; // a touch interaction is active; ignore mouse events
      isDragging = true;
      startX = e.clientX - translateX;
      startY = e.clientY - translateY;
      updateTransform();
      e.preventDefault();
    });

    document.addEventListener('mousemove', (e) => {
      if (isDragging && !isTouch) {
        translateX = e.clientX - startX;
        translateY = e.clientY - startY;
        updateTransform();
      }
    });

    function endMouseDrag() {
      if (isDragging && !isTouch) {
        isDragging = false;
        updateTransform();
      }
    }
    document.addEventListener('mouseup', endMouseDrag);
    document.addEventListener('mouseleave', endMouseDrag);

    // Distance between two touch points.
    function getTouchDistance(touch1, touch2) {
      return Math.hypot(
        touch2.clientX - touch1.clientX,
        touch2.clientY - touch1.clientY
      );
    }

    // Begin a one-finger drag from the given touch point.
    function beginTouchDrag(touch) {
      isPinching = false;
      isDragging = true;
      startX = touch.clientX - translateX;
      startY = touch.clientY - translateY;
    }

    // Clear gesture state; `isTouch` is reset after a short delay so that
    // mouse events fired right after the touch are still ignored.
    function releaseTouch() {
      isDragging = false;
      isPinching = false;
      initialDistance = 0;
      setTimeout(() => {
        isTouch = false;
      }, TOUCH_RELEASE_DELAY_MS);
    }

    // Touch events
    mermaidElement.addEventListener('touchstart', (e) => {
      isTouch = true;
      if (e.touches.length === 1) {
        // One finger: drag
        beginTouchDrag(e.touches[0]);
      } else if (e.touches.length === 2) {
        // Two fingers: pinch-zoom
        isPinching = true;
        isDragging = false;
        initialDistance = getTouchDistance(e.touches[0], e.touches[1]);
        initialScale = scale;
      }
      e.preventDefault();
    }, { passive: false });

    mermaidElement.addEventListener('touchmove', (e) => {
      if (e.touches.length === 1 && isDragging && !isPinching) {
        // One finger: drag
        const touch = e.touches[0];
        translateX = touch.clientX - startX;
        translateY = touch.clientY - startY;
        updateTransform();
      } else if (e.touches.length === 2 && isPinching) {
        // Two fingers: scale relative to the finger distance at pinch start
        const currentDistance = getTouchDistance(e.touches[0], e.touches[1]);
        if (initialDistance > 0) {
          scale = clampScale(initialScale * (currentDistance / initialDistance));
          updateTransform();
        }
      }
      e.preventDefault();
    }, { passive: false });

    mermaidElement.addEventListener('touchend', (e) => {
      if (e.touches.length === 0) {
        // All fingers lifted: reset state
        releaseTouch();
      } else if (e.touches.length === 1 && isPinching) {
        // Went from two fingers to one: switch to drag mode
        beginTouchDrag(e.touches[0]);
      }
      updateTransform();
    });

    mermaidElement.addEventListener('touchcancel', () => {
      releaseTouch();
      updateTransform();
    });

    // Wheel zoom. The pan offset is scaled by the same ratio as the zoom so
    // the view as a whole is scaled about the element's transform origin.
    // preventDefault stops the page from scrolling while the pointer is over
    // the diagram, hence the explicitly non-passive listener.
    container.addEventListener('wheel', (e) => {
      e.preventDefault();
      const delta = e.deltaY > 0 ? 0.9 : 1.1;
      const newScale = clampScale(scale * delta);
      if (newScale !== scale) {
        const scaleDiff = newScale / scale;
        translateX = translateX * scaleDiff;
        translateY = translateY * scaleDiff;
        scale = newScale;
        recenterIfNotZoomed();
        updateTransform();
      }
    }, { passive: false });

    // Initialize display
    updateTransform();
  });
}
// Attach the diagram controls once the document has finished parsing.
document.addEventListener('DOMContentLoaded', () => {
  initializeMermaidControls();
});
// Smooth scrolling for in-page anchor links.
// The target is resolved with getElementById instead of querySelector: a bare
// "#" placeholder link (or an id that is not a valid CSS selector) makes
// querySelector throw a SyntaxError. The default jump is always suppressed;
// links with no matching target simply do nothing.
document.querySelectorAll('a[href^="#"]').forEach(anchor => {
  anchor.addEventListener('click', function (e) {
    e.preventDefault();

    const fragment = this.getAttribute('href').slice(1);
    if (!fragment) {
      return; // bare "#" placeholder: nothing to scroll to
    }

    const target = document.getElementById(fragment);
    if (target) {
      target.scrollIntoView({
        behavior: 'smooth',
        block: 'start'
      });
    }
  });
});
// Highlight the TOC entry of the section currently in view.
// Sections and their links are resolved once, up front, instead of on every
// scroll event. The link is looked up among `.toc-link` elements first,
// because highlights are only ever cleared from `.toc-link`; the generic
// anchor lookup is kept as a fallback.
{
  const HIGHLIGHT_CLASSES = ['text-accent', 'font-semibold'];
  // Offset added to the scroll position before comparing with section bounds.
  const SCROLL_OFFSET = 100;

  const tocLinks = document.querySelectorAll('.toc-link');
  const tracked = [];
  document.querySelectorAll('section[id]').forEach(section => {
    const id = section.getAttribute('id');
    const link = document.querySelector(`.toc-link[href="#${id}"]`) ||
      document.querySelector(`a[href="#${id}"]`);
    if (link) {
      tracked.push({ section, link });
    }
  });

  window.addEventListener('scroll', () => {
    const scrollPos = window.scrollY + SCROLL_OFFSET;
    // Section geometry is read at scroll time because layout can change
    // after load. When several sections match, the last one wins.
    tracked.forEach(({ section, link }) => {
      const top = section.offsetTop;
      const bottom = top + section.offsetHeight;
      if (scrollPos >= top && scrollPos <= bottom) {
        tocLinks.forEach(l => l.classList.remove(...HIGHLIGHT_CLASSES));
        link.classList.add(...HIGHLIGHT_CLASSES);
      }
    });
  }, { passive: true });
}
// Toggle the TOC on small screens.
// Both elements are looked up defensively: if either `#toc-toggle` or `#toc`
// is missing, the handlers become no-ops instead of throwing. The toggle's
// `aria-expanded` attribute is kept in sync with the `open` class.
const tocToggle = document.getElementById('toc-toggle');
const toc = document.getElementById('toc');

if (tocToggle && toc) {
  tocToggle.setAttribute('aria-expanded', String(toc.classList.contains('open')));
  tocToggle.addEventListener('click', () => {
    const isOpen = toc.classList.toggle('open');
    tocToggle.setAttribute('aria-expanded', String(isOpen));
  });
}

// Close the TOC after a link is chosen when the viewport is 1024px or narrower.
document.querySelectorAll('.toc-link').forEach(link => {
  link.addEventListener('click', () => {
    if (toc && window.innerWidth <= 1024) {
      toc.classList.remove('open');
      if (tocToggle) {
        tocToggle.setAttribute('aria-expanded', 'false');
      }
    }
  });
});
</script>
</body></html>
<!-- Stray text captured from the hosting page (not part of this document):
登录后可参与表态
讨论回复
0 条回复还没有人回复,快来发表你的看法吧!
-->