feat: 增强多模态支持,更新 export 和 MediaView 组件以处理图片 URL,优化消息内容构建逻辑

This commit is contained in:
carry
2026-06-08 21:13:55 +08:00
parent 742659df43
commit d04c85d959
4 changed files with 231 additions and 162 deletions
+99
View File
@@ -1,5 +1,6 @@
import { describe, it, expect } from 'vitest' import { describe, it, expect } from 'vitest'
import { exportToOpenAIFormat, segmentToText } from '../utils/export' import { exportToOpenAIFormat, segmentToText } from '../utils/export'
import type { OpenAIExport, OpenAIContentPart, OpenAIImageUrlContent } from '../utils/export'
import type { PromptEnvelope } from '../types/protocol' import type { PromptEnvelope } from '../types/protocol'
describe('segmentToText', () => { describe('segmentToText', () => {
@@ -502,4 +503,102 @@ describe('exportToOpenAIFormat', () => {
const result = exportToOpenAIFormat(env) const result = exportToOpenAIFormat(env)
expect(result.tools).toBeUndefined() expect(result.tools).toBeUndefined()
}) })
// ── Multimodal: image URLs must be passed through into the OpenAI content array ──
it('exports user message with image as content array (text + image_url)', () => {
  // Fixture: a user turn carrying text plus an image, then a plain-text assistant reply.
  const envelope: PromptEnvelope = {
    version: '1.0',
    messages: [
      {
        id: '1',
        role: 'user',
        segments: [
          { kind: 'text', content: '这张图片是什么?' },
          {
            kind: 'media',
            mediaType: 'image',
            url: 'https://cataas.com/cat',
            altText: '用户上传的图片',
          },
        ],
        timestamp: 0,
      },
      {
        id: '2',
        role: 'assistant',
        segments: [{ kind: 'text', content: '这是一只猫。' }],
        timestamp: 0,
      },
    ],
  }

  const exported = exportToOpenAIFormat(envelope)
  expect(exported.messages).toHaveLength(2)
  const [userMsg, assistantMsg] = exported.messages

  // The user turn is exported as a content-part array: text first, image second.
  expect(userMsg.role).toBe('user')
  expect(Array.isArray(userMsg.content)).toBe(true)
  const contentParts = userMsg.content as OpenAIContentPart[]
  expect(contentParts).toHaveLength(2)
  expect(contentParts[0]).toEqual({ type: 'text', text: '这张图片是什么?' })
  expect(contentParts[1]).toEqual({
    type: 'image_url',
    image_url: { url: 'https://cataas.com/cat' },
  })

  // The assistant turn is still exported as a plain string.
  expect(assistantMsg.role).toBe('assistant')
  expect(assistantMsg.content).toBe('这是一只猫。')
})
it('exports user message with only image (no text)', () => {
  // Fixture: a single user turn whose only segment is an image.
  const photoUrl = 'https://example.com/photo.png'
  const envelope: PromptEnvelope = {
    version: '1.0',
    messages: [
      {
        id: '1',
        role: 'user',
        segments: [{ kind: 'media', mediaType: 'image', url: photoUrl }],
        timestamp: 0,
      },
    ],
  }

  const exported = exportToOpenAIFormat(envelope)
  expect(exported.messages).toHaveLength(1)

  // Even without text, the content is an array holding exactly one image_url part.
  const [onlyMsg] = exported.messages
  const contentParts = onlyMsg.content as OpenAIContentPart[]
  expect(Array.isArray(contentParts)).toBe(true)
  expect(contentParts).toHaveLength(1)

  const imagePart = contentParts[0] as OpenAIImageUrlContent
  expect(imagePart.type).toBe('image_url')
  expect(imagePart.image_url.url).toBe(photoUrl)
})
it('uses plain string content when no media segment present (backward compat)', () => {
  // Fixture: a user turn made of two text segments and no media.
  const envelope: PromptEnvelope = {
    version: '1.0',
    messages: [
      {
        id: '1',
        role: 'user',
        segments: [
          { kind: 'text', content: 'Hello' },
          { kind: 'text', content: 'World' },
        ],
        timestamp: 0,
      },
    ],
  }

  const exported = exportToOpenAIFormat(envelope)
  const [firstMsg] = exported.messages

  // Text-only turns keep the string form and contain both fragments.
  expect(typeof firstMsg.content).toBe('string')
  expect(firstMsg.content).toContain('Hello')
  expect(firstMsg.content).toContain('World')
})
}) })
+46 -8
View File
@@ -1,5 +1,6 @@
import { useState } from 'react'
import type { MediaSegment } from '../../types/protocol' import type { MediaSegment } from '../../types/protocol'
import { Image, Music, Video } from 'lucide-react' import { Image, Music, Video, ExternalLink, AlertTriangle } from 'lucide-react'
const mediaConfig = { const mediaConfig = {
image: { icon: Image, label: '图片', bg: 'bg-blue-50', color: 'text-blue-600' }, image: { icon: Image, label: '图片', bg: 'bg-blue-50', color: 'text-blue-600' },
@@ -10,17 +11,54 @@ const mediaConfig = {
export default function MediaView({ segment }: { segment: MediaSegment }) { export default function MediaView({ segment }: { segment: MediaSegment }) {
const cfg = mediaConfig[segment.mediaType] const cfg = mediaConfig[segment.mediaType]
const Icon = cfg.icon const Icon = cfg.icon
const [imgError, setImgError] = useState(false)
// 图片有 URL → 直接渲染缩略图
if (segment.mediaType === 'image' && segment.url && !imgError) {
return ( return (
<div className={`my-2 flex items-center gap-3 rounded-lg border px-3 py-2.5 ${cfg.bg} border-gray-200`}> <div className="my-2">
<Icon size={20} className={cfg.color} /> <img
<div className="flex-1"> src={segment.url}
<span className={`text-sm font-medium ${cfg.color}`}>{cfg.label}</span> alt={segment.altText || '图片'}
className="max-w-full max-h-80 rounded-lg border border-gray-200 object-cover"
onError={() => setImgError(true)}
/>
{segment.altText && ( {segment.altText && (
<span className="text-xs text-gray-500 ml-2">{segment.altText}</span> <div className="mt-1 text-xs text-gray-400">{segment.altText}</div>
)}
</div>
)
}
// 图片加载失败 / 无 URL / 非图片类型 → 图标占位
return (
<div className={`my-2 flex items-center gap-3 rounded-lg border px-3 py-2.5 ${cfg.bg} border-gray-200`}>
{imgError ? (
<AlertTriangle size={20} className="text-amber-500" />
) : (
<Icon size={20} className={cfg.color} />
)}
<div className="flex-1 min-w-0">
<span className={`text-sm font-medium ${imgError ? 'text-amber-600' : cfg.color}`}>
{imgError ? '图片加载失败' : cfg.label}
</span>
{segment.altText && (
<span className="text-xs text-gray-500 ml-2 truncate">{segment.altText}</span>
)}
</div>
{segment.url ? (
<a
href={segment.url}
target="_blank"
rel="noopener noreferrer"
className="flex items-center gap-1 text-xs text-gray-400 hover:text-gray-600 transition-colors"
>
<ExternalLink size={12} />
</a>
) : (
<span className="text-xs text-gray-400">{segment.mediaType}</span>
)} )}
</div> </div>
<span className="text-xs text-gray-400">{segment.mediaType}</span>
</div>
) )
} }
+6 -147
View File
@@ -1,6 +1,6 @@
{ {
"version": "1.0", "version": "1.0",
"model": "gpt-4-turbo", "model": "gpt-4o",
"messages": [ "messages": [
{ {
"id": "d-1", "id": "d-1",
@@ -18,96 +18,9 @@
"value": "中文(简体)", "value": "中文(简体)",
"description": "模型回复的首选语言" "description": "模型回复的首选语言"
}, },
{
"kind": "static_var",
"name": "knowledge_cutoff",
"value": "2026年1月",
"description": "模型训练数据截止日期"
},
{
"kind": "static_var",
"name": "user_name",
"value": "小明",
"description": "当前用户名称"
},
{ {
"kind": "system_prompt", "kind": "system_prompt",
"content": "当前日期:{{current_date}}\n用户:{{user_name}}\n回复语言:{{language}}\n知识截止:{{knowledge_cutoff}}\n\n你是 Claude,一个 HCI 研究助手。你的角色是帮助学生批判性地思考聊天界面的设计问题。\n\n核心原则:\n- 鼓励从用户体验角度分析,而非技术实现角度\n- 用具体例子说明抽象概念\n- 如果学生的方案有改进空间,以提问的方式引导而非直接批评\n- 始终记住你拥有工具调用、skills 和跨对话 memory 能力,但不必每次都全部用到", "content": "当前日期:{{current_date}}\n回复语言:{{language}}\n\n你是一个多模态助手,支持图文理解。用户可以在消息中附带图片,你能够直接理解图片内容并做出回复。",
"collapsed": true
},
{
"kind": "memory",
"description": "以下是从过往对话中总结的关于你的信息,模型会参考这些记忆来个性化回复。记忆随对话自动更新,你也可以手动编辑或删除。",
"items": [
{ "title": "用户身份", "content": "小明,设计系研二,HCI 方向。正在做课程设计项目。" },
{ "title": "项目背景", "content": "设计一个透明化 LLM 上下文的聊天协议。已确定了 9 种 prompt 类型的分类方案。" },
{ "title": "沟通偏好", "content": "喜欢用图示和表格辅助理解。反感过度术语化。需要看到具体例子。" },
{ "title": "上次进度", "content": "用户已确认了 MVP 范围:Web 应用,数据协议+视觉规范,可导出 OpenAI Format。" }
],
"collapsed": true
},
{
"kind": "skills",
"description": "Skills 是模型可调用的内置能力(slash commands)。输入 / 开头即可触发,无需等待模型推理。当前对话中可用以下 skills:",
"items": [
{ "name": "deep-research", "description": "深度研究 — 多源搜索、交叉验证、生成引用报告" },
{ "name": "code-review", "description": "代码审查 — 发现正确性 bug 和简化/效率优化机会" },
{ "name": "verify", "description": "行为验证 — 运行应用并观察行为来确认变更生效" },
{ "name": "simplify", "description": "代码简化 — 审查代码的复用性、简洁性和效率并应用修复" },
{ "name": "loop", "description": "定时循环 — 按指定间隔重复执行一个命令或 prompt" }
],
"collapsed": true
},
{
"kind": "tool_overview",
"items": [
{
"name": "search",
"description": "搜索学术文献和设计案例",
"parameters": "query: string, limit?: number",
"schema": {
"type": "object",
"properties": {
"query": { "type": "string", "description": "搜索关键词" },
"limit": { "type": "number", "description": "返回结果数量" }
},
"required": ["query"]
}
},
{
"name": "read_file",
"description": "读取文件内容",
"parameters": "path: string",
"schema": {
"type": "object",
"properties": { "path": { "type": "string", "description": "文件路径" } },
"required": ["path"]
}
},
{
"name": "fetch_url",
"description": "获取网页内容并转为 markdown",
"parameters": "url: string",
"schema": {
"type": "object",
"properties": { "url": { "type": "string", "description": "网页 URL" } },
"required": ["url"]
}
},
{
"name": "run_code",
"description": "在沙箱中执行代码",
"parameters": "language: string, code: string",
"schema": {
"type": "object",
"properties": {
"language": { "type": "string", "enum": ["python", "javascript", "r"], "description": "编程语言" },
"code": { "type": "string", "description": "代码内容" }
},
"required": ["language", "code"]
}
}
],
"collapsed": true "collapsed": true
} }
], ],
@@ -119,26 +32,13 @@
"segments": [ "segments": [
{ {
"kind": "text", "kind": "text",
"content": "你好!我在准备课程设计的文献综述部分。我找到了一篇相关的研究报告,帮我分析一下它是否可以支持我的论点。" "content": "这张图片是什么?"
},
{
"kind": "long_text",
"content": "在过去的18个月里,我们对127名LLM聊天产品用户进行了纵向研究。研究分为三个阶段:基线观察期(3个月)、干预实验期(6个月)、后续跟踪期(9个月)。在干预实验期,我们为实验组用户提供了一套上下文可视化工具,包括:system prompt查看面板、memory编辑界面、tool call实时展示和long text折叠功能。对照组使用标准的聊天界面。\n\n实验组在以下维度上表现出显著改善:\n- 任务完成率提升 23.4%p < 0.01\n- 纠错响应时间缩短 41.7%p < 0.001\n- 用户信任度评分从 3.2/5 提升至 4.1/5\n- 对话轮次平均减少 2.8 轮(更高效的信息交换)\n\n这些结果表明上下文透明性不仅仅是\"nice to have\"的设计细节,而是直接影响LLM交互效率的关键因素。特别是在以下场景中效果最为显著:\n1. 长文档分析:用户能够看到哪些文档片段被模型引用\n2. 多工具调用:工具链的可视化帮助用户理解推理过程\n3. 跨会话任务:memory可见性减少重复说明\n\n我们建议将上下文透明性作为LLM聊天产品的基础设计原则,而非可选特性。",
"charCount": 567,
"collapsed": true
},
{
"kind": "document",
"fileName": "context-transparency-study-2025.pdf",
"mimeType": "application/pdf",
"snippet": "DOI: 10.1145/3613904.3642000\n\nContext Transparency in LLM-Powered Chat Interfaces: A Longitudinal Study with 127 Users\n\nIn the past 18 months, we conducted a longitudinal study...",
"sizeBytes": 2457600
}, },
{ {
"kind": "media", "kind": "media",
"mediaType": "image", "mediaType": "image",
"url": "", "url": "https://cataas.com/cat",
"altText": "Fig 3: 实验组 vs 对照组任务完成率对比图(p<0.01)" "altText": "用户上传的图片"
} }
], ],
"timestamp": 1780897900000 "timestamp": 1780897900000
@@ -149,51 +49,10 @@
"segments": [ "segments": [
{ {
"kind": "text", "kind": "text",
"content": "这是一篇非常有力的参考文献!它恰恰支持了你的核心论点——上下文透明性是“基础设计原则,而非可选特性”。" "content": "这是一只猫。"
}
],
"timestamp": 1780897920000
},
{
"id": "d-4",
"role": "assistant",
"segments": [
{
"kind": "tool_call_request",
"toolName": "search",
"arguments": {
"query": "LLM context transparency HCI chat interface user study 2024 2025",
"limit": 5
},
"collapsed": false
}
],
"timestamp": 1780897930000
},
{
"id": "d-5",
"role": "assistant",
"segments": [
{
"kind": "tool_call_result",
"toolName": "search",
"success": true,
"result": "Found 5 results:\n\n1. \"Transparent AI: Designing Explainable Chat Interfaces\" — CHI 2024\n → 提出了\"分层透明度\"框架,与本项目的\"信息密度梯度\"高度一致\n\n2. \"User Mental Models of LLM Context\" — CSCW 2025\n → 发现 78% 的用户低估了 system prompt 对输出的影响\n\n3. \"Folding as a Visual Strategy for Complex Chat UIs\" — DIS 2024\n → 验证了折叠/展开对降低认知负荷的效果\n\n4. \"Beyond the Black Box: Visualizing LLM Reasoning in Chat\" — TOCHI 2025\n → 工具调用可视化的用户研究,与本项目的 tool call request/result 设计一致\n\n5. \"Memory-Aware Interfaces for Persistent LLM Interactions\" — IUI 2025\n → 讨论了用户记忆的可编辑性和可遗忘性",
"collapsed": true
} }
], ],
"timestamp": 1780897950000 "timestamp": 1780897950000
},
{
"id": "d-6",
"role": "assistant",
"segments": [
{
"kind": "text",
"content": "搜索结果显示,你的研究方向和当前学术界的热点高度一致。以下是我对文献支持度的评估:\n\n**强相关(可直接引用):**\n- 第2篇(CSCW 2025)直接量化了用户对 LLM 上下文的理解不足——这是你\"问题定义\"部分的关键论据\n- 第3篇(DIS 2024)为你的\"折叠设计\"提供了实证基础\n\n**建议阅读方向:**\n- 第1篇的\"分层透明度\"框架可以与你的 9 种类型分类做对比讨论\n- 第5篇的 memory 可编辑性可能是你下一个值得探索的方向\n\n这篇研究报告 + 这5篇文献构成了一个很好的文献基础。需要我帮你整理成引用格式吗?"
}
],
"timestamp": 1780898000000
} }
] ]
} }
+80 -7
View File
@@ -11,9 +11,26 @@ export interface OpenAIToolCall {
} }
} }
/**
 * Multimodal content part: text.
 *
 * One element of the `content` array of a multimodal message; `type` is the
 * literal tag that distinguishes it from the other content-part shapes.
 */
export interface OpenAITextContent {
type: 'text'
text: string
}
/**
 * Multimodal content part: image URL.
 *
 * One element of the `content` array of a multimodal message. `image_url.url`
 * carries the image location; `image_url.detail` is optional and limited to
 * 'auto' | 'low' | 'high'.
 * NOTE(review): `detail` presumably maps to the OpenAI image detail setting — confirm.
 */
export interface OpenAIImageUrlContent {
type: 'image_url'
image_url: {
url: string
detail?: 'auto' | 'low' | 'high'
}
}
/** Any single multimodal content part: either a text part or an image-URL part, discriminated by `type`. */
export type OpenAIContentPart = OpenAITextContent | OpenAIImageUrlContent
export interface OpenAIMessage { export interface OpenAIMessage {
role: 'system' | 'user' | 'assistant' | 'tool' role: 'system' | 'user' | 'assistant' | 'tool'
content?: string | null content?: string | null | OpenAIContentPart[]
tool_calls?: OpenAIToolCall[] tool_calls?: OpenAIToolCall[]
tool_call_id?: string tool_call_id?: string
} }
@@ -115,6 +132,66 @@ function toolItemToOpenAI(item: ToolItem): OpenAITool {
} }
} }
/**
 * Builds the `content` field of an OpenAI user message from a segment list.
 *
 * - No `media` segment present: every segment is rendered with `segmentToText`,
 *   empty results are dropped and the rest is joined with newlines (plain string).
 * - At least one `media` segment present: an `OpenAIContentPart[]` is returned.
 *   Images that carry a usable URL become `image_url` parts; every other
 *   supported segment (including audio/video and images without a URL) becomes
 *   a `text` part.
 *
 * @param segments Segments of a single message, in display order.
 * @returns The string or content-part array to use as `content`, or `null`
 *   when there is nothing meaningful to send (empty or whitespace-only text,
 *   or no part produced).
 */
function buildUserContent(segments: Segment[]): string | OpenAIContentPart[] | null {
  const hasMedia = segments.some((s) => s.kind === 'media')

  if (!hasMedia) {
    // Text-only path: same rendering as segmentToText.
    const text = segments
      .map(segmentToText)
      .filter(Boolean)
      .join('\n')
    // Whitespace-only content is treated as "no message", as the caller's
    // former `content.trim()` check did.
    return text.trim() ? text : null
  }

  // Multimodal path: build the content-part array.
  const parts: OpenAIContentPart[] = []

  // Blank text parts are skipped, mirroring the `.filter(Boolean)` of the text-only path.
  const pushText = (text: string): void => {
    if (text.trim() !== '') {
      parts.push({ type: 'text', text })
    }
  }

  for (const seg of segments) {
    switch (seg.kind) {
      case 'text':
      case 'long_text':
        pushText(seg.content)
        break
      case 'static_var':
        pushText(seg.value)
        break
      case 'document':
        pushText(`[Document: ${seg.fileName} (${formatBytes(seg.sizeBytes)})]\n${seg.snippet}`)
        break
      case 'media': {
        // Only images with a non-blank URL can be sent as image_url parts; an
        // empty URL would produce an invalid part.
        const imageUrl = seg.mediaType === 'image' ? seg.url?.trim() : undefined
        if (imageUrl) {
          parts.push({
            type: 'image_url',
            image_url: { url: imageUrl },
          })
        } else {
          // audio / video (limited native OpenAI support) and URL-less images:
          // fall back to altText, or to a `[mediaType]` placeholder.
          pushText(seg.altText?.trim() || `[${seg.mediaType}]`)
        }
        break
      }
      default:
        // Structural segments (system_prompt / memory / skills / tool_overview)
        // and tool_call* segments do not appear in user content.
        break
    }
  }

  return parts.length > 0 ? parts : null
}
// --- Main export function --- // --- Main export function ---
/** /**
@@ -194,15 +271,11 @@ export function exportToOpenAIFormat(envelope: PromptEnvelope): OpenAIExport {
if (seg.kind === 'tool_overview') { if (seg.kind === 'tool_overview') {
seg.items.forEach(item => tools.push(toolItemToOpenAI(item))) seg.items.forEach(item => tools.push(toolItemToOpenAI(item)))
} }
if (seg.kind === 'static_var') continue // 静态变量仅通过 segmentToText 展开到消息内容,不写入 system prompt
const s = formatStructural(seg) const s = formatStructural(seg)
if (s) systemParts.push(s) if (s) systemParts.push(s)
} }
const content = msg.segments const content = buildUserContent(msg.segments)
.map(segmentToText) if (content) {
.filter(Boolean)
.join('\n')
if (content.trim()) {
messages.push({ role: 'user', content }) messages.push({ role: 'user', content })
} }
continue continue