From d04c85d9599d8f4737ab2e8d4a7627ce04fcd95f Mon Sep 17 00:00:00 2001 From: carry Date: Mon, 8 Jun 2026 21:13:55 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E5=A2=9E=E5=BC=BA=E5=A4=9A=E6=A8=A1?= =?UTF-8?q?=E6=80=81=E6=94=AF=E6=8C=81=EF=BC=8C=E6=9B=B4=E6=96=B0=20export?= =?UTF-8?q?=20=E5=92=8C=20MediaView=20=E7=BB=84=E4=BB=B6=E4=BB=A5=E5=A4=84?= =?UTF-8?q?=E7=90=86=E5=9B=BE=E7=89=87=20URL=EF=BC=8C=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E6=B6=88=E6=81=AF=E5=86=85=E5=AE=B9=E6=9E=84=E5=BB=BA=E9=80=BB?= =?UTF-8?q?=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/__tests__/export.test.ts | 99 +++++++++++++++++ src/components/segments/MediaView.tsx | 54 +++++++-- src/data/demos/demo-d.json | 153 +------------------------- src/utils/export.ts | 87 +++++++++++++-- 4 files changed, 231 insertions(+), 162 deletions(-) diff --git a/src/__tests__/export.test.ts b/src/__tests__/export.test.ts index 9ad6288..d00a54a 100644 --- a/src/__tests__/export.test.ts +++ b/src/__tests__/export.test.ts @@ -1,5 +1,6 @@ import { describe, it, expect } from 'vitest' import { exportToOpenAIFormat, segmentToText } from '../utils/export' +import type { OpenAIExport, OpenAIContentPart, OpenAIImageUrlContent } from '../utils/export' import type { PromptEnvelope } from '../types/protocol' describe('segmentToText', () => { @@ -502,4 +503,102 @@ describe('exportToOpenAIFormat', () => { const result = exportToOpenAIFormat(env) expect(result.tools).toBeUndefined() }) + + // ── Multimodal: 图片 URL 应正确传入 OpenAI content 数组 ── + + it('exports user message with image as content array (text + image_url)', () => { + const env: PromptEnvelope = { + version: '1.0', + messages: [ + { + id: '1', + role: 'user', + segments: [ + { kind: 'text', content: '这张图片是什么?' }, + { + kind: 'media', + mediaType: 'image', + url: 'https://cataas.com/cat', + altText: '用户上传的图片', + }, + ], + timestamp: 0, + }, + { + id: '2', + role: 'assistant', + segments: [{ kind: 'text', content: '这是一只猫。' }], + timestamp: 0, + }, + ], + } + const { messages } = exportToOpenAIFormat(env) + expect(messages).toHaveLength(2) + + const userMsg = messages[0] + expect(userMsg.role).toBe('user') + expect(Array.isArray(userMsg.content)).toBe(true) + + const parts = userMsg.content as OpenAIContentPart[] + expect(parts).toHaveLength(2) + expect(parts[0]).toEqual({ type: 'text', text: '这张图片是什么?' }) + expect(parts[1]).toEqual({ + type: 'image_url', + image_url: { url: 'https://cataas.com/cat' }, + }) + + // assistant 仍为纯文本 + expect(messages[1].role).toBe('assistant') + expect(messages[1].content).toBe('这是一只猫。') + }) + + it('exports user message with only image (no text)', () => { + const env: PromptEnvelope = { + version: '1.0', + messages: [ + { + id: '1', + role: 'user', + segments: [ + { + kind: 'media', + mediaType: 'image', + url: 'https://example.com/photo.png', + }, + ], + timestamp: 0, + }, + ], + } + const { messages } = exportToOpenAIFormat(env) + expect(messages).toHaveLength(1) + + const parts = messages[0].content as OpenAIContentPart[] + expect(Array.isArray(parts)).toBe(true) + expect(parts).toHaveLength(1) + const img = parts[0] as OpenAIImageUrlContent + expect(img.type).toBe('image_url') + expect(img.image_url.url).toBe('https://example.com/photo.png') + }) + + it('uses plain string content when no media segment present (backward compat)', () => { + const env: PromptEnvelope = { + version: '1.0', + messages: [ + { + id: '1', + role: 'user', + segments: [ + { kind: 'text', content: 'Hello' }, + { kind: 'text', content: 'World' }, + ], + timestamp: 0, + }, + ], + } + const { messages } = exportToOpenAIFormat(env) + expect(typeof messages[0].content).toBe('string') + expect(messages[0].content).toContain('Hello') + expect(messages[0].content).toContain('World') + }) }) diff --git a/src/components/segments/MediaView.tsx b/src/components/segments/MediaView.tsx index 3b9ec6c..bc2daa7 100644 --- a/src/components/segments/MediaView.tsx +++ b/src/components/segments/MediaView.tsx @@ -1,5 +1,6 @@ +import { useState } from 'react' import type { MediaSegment } from '../../types/protocol' -import { Image, Music, Video } from 'lucide-react' +import { Image, Music, Video, ExternalLink, AlertTriangle } from 'lucide-react' const mediaConfig = { image: { icon: Image, label: '图片', bg: 'bg-blue-50', color: 'text-blue-600' }, @@ -10,17 +11,54 @@ const mediaConfig = { export default function MediaView({ segment }: { segment: MediaSegment }) { const cfg = mediaConfig[segment.mediaType] const Icon = cfg.icon + const [imgError, setImgError] = useState(false) - return ( -
- -
- {cfg.label} + // 图片有 URL → 直接渲染缩略图 + if (segment.mediaType === 'image' && segment.url && !imgError) { + return ( +
+ {segment.altText setImgError(true)} + /> {segment.altText && ( - {segment.altText} +
{segment.altText}
)}
- {segment.mediaType} + ) + } + + // 图片加载失败 / 无 URL / 非图片类型 → 图标占位 + return ( +
+ {imgError ? ( + + ) : ( + + )} +
+ + {imgError ? '图片加载失败' : cfg.label} + + {segment.altText && ( + {segment.altText} + )} +
+ {segment.url ? ( + + + 打开 + + ) : ( + {segment.mediaType} + )}
) } diff --git a/src/data/demos/demo-d.json b/src/data/demos/demo-d.json index b889fc7..de70924 100644 --- a/src/data/demos/demo-d.json +++ b/src/data/demos/demo-d.json @@ -1,6 +1,6 @@ { "version": "1.0", - "model": "gpt-4-turbo", + "model": "gpt-4o", "messages": [ { "id": "d-1", @@ -18,96 +18,9 @@ "value": "中文(简体)", "description": "模型回复的首选语言" }, - { - "kind": "static_var", - "name": "knowledge_cutoff", - "value": "2026年1月", - "description": "模型训练数据截止日期" - }, - { - "kind": "static_var", - "name": "user_name", - "value": "小明", - "description": "当前用户名称" - }, { "kind": "system_prompt", - "content": "当前日期:{{current_date}}\n用户:{{user_name}}\n回复语言:{{language}}\n知识截止:{{knowledge_cutoff}}\n\n你是 Claude,一个 HCI 研究助手。你的角色是帮助学生批判性地思考聊天界面的设计问题。\n\n核心原则:\n- 鼓励从用户体验角度分析,而非技术实现角度\n- 用具体例子说明抽象概念\n- 如果学生的方案有改进空间,以提问的方式引导而非直接批评\n- 始终记住你拥有工具调用、skills 和跨对话 memory 能力,但不必每次都全部用到", - "collapsed": true - }, - { - "kind": "memory", - "description": "以下是从过往对话中总结的关于你的信息,模型会参考这些记忆来个性化回复。记忆随对话自动更新,你也可以手动编辑或删除。", - "items": [ - { "title": "用户身份", "content": "小明,设计系研二,HCI 方向。正在做课程设计项目。" }, - { "title": "项目背景", "content": "设计一个透明化 LLM 上下文的聊天协议。已确定了 9 种 prompt 类型的分类方案。" }, - { "title": "沟通偏好", "content": "喜欢用图示和表格辅助理解。反感过度术语化。需要看到具体例子。" }, - { "title": "上次进度", "content": "用户已确认了 MVP 范围:Web 应用,数据协议+视觉规范,可导出 OpenAI Format。" } - ], - "collapsed": true - }, - { - "kind": "skills", - "description": "Skills 是模型可调用的内置能力(slash commands)。输入 / 开头即可触发,无需等待模型推理。当前对话中可用以下 skills:", - "items": [ - { "name": "deep-research", "description": "深度研究 — 多源搜索、交叉验证、生成引用报告" }, - { "name": "code-review", "description": "代码审查 — 发现正确性 bug 和简化/效率优化机会" }, - { "name": "verify", "description": "行为验证 — 运行应用并观察行为来确认变更生效" }, - { "name": "simplify", "description": "代码简化 — 审查代码的复用性、简洁性和效率并应用修复" }, - { "name": "loop", "description": "定时循环 — 按指定间隔重复执行一个命令或 prompt" } - ], - "collapsed": true - }, - { - "kind": "tool_overview", - "items": [ - { - "name": "search", - "description": "搜索学术文献和设计案例", - "parameters": "query: string, limit?: number", - "schema": { - "type": "object", - "properties": { - "query": { "type": "string", "description": "搜索关键词" }, - "limit": { "type": "number", "description": "返回结果数量" } - }, - "required": ["query"] - } - }, - { - "name": "read_file", - "description": "读取文件内容", - "parameters": "path: string", - "schema": { - "type": "object", - "properties": { "path": { "type": "string", "description": "文件路径" } }, - "required": ["path"] - } - }, - { - "name": "fetch_url", - "description": "获取网页内容并转为 markdown", - "parameters": "url: string", - "schema": { - "type": "object", - "properties": { "url": { "type": "string", "description": "网页 URL" } }, - "required": ["url"] - } - }, - { - "name": "run_code", - "description": "在沙箱中执行代码", - "parameters": "language: string, code: string", - "schema": { - "type": "object", - "properties": { - "language": { "type": "string", "enum": ["python", "javascript", "r"], "description": "编程语言" }, - "code": { "type": "string", "description": "代码内容" } - }, - "required": ["language", "code"] - } - } - ], + "content": "当前日期:{{current_date}}\n回复语言:{{language}}\n\n你是一个多模态助手,支持图文理解。用户可以在消息中附带图片,你能够直接理解图片内容并做出回复。", "collapsed": true } ], @@ -119,26 +32,13 @@ "segments": [ { "kind": "text", - "content": "你好!我在准备课程设计的文献综述部分。我找到了一篇相关的研究报告,帮我分析一下它是否可以支持我的论点。" - }, - { - "kind": "long_text", - "content": "在过去的18个月里,我们对127名LLM聊天产品用户进行了纵向研究。研究分为三个阶段:基线观察期(3个月)、干预实验期(6个月)、后续跟踪期(9个月)。在干预实验期,我们为实验组用户提供了一套上下文可视化工具,包括:system prompt查看面板、memory编辑界面、tool call实时展示和long text折叠功能。对照组使用标准的聊天界面。\n\n实验组在以下维度上表现出显著改善:\n- 任务完成率提升 23.4%(p < 0.01)\n- 纠错响应时间缩短 41.7%(p < 0.001)\n- 用户信任度评分从 3.2/5 提升至 4.1/5\n- 对话轮次平均减少 2.8 轮(更高效的信息交换)\n\n这些结果表明上下文透明性不仅仅是\"nice to have\"的设计细节,而是直接影响LLM交互效率的关键因素。特别是在以下场景中效果最为显著:\n1. 长文档分析:用户能够看到哪些文档片段被模型引用\n2. 多工具调用:工具链的可视化帮助用户理解推理过程\n3. 跨会话任务:memory可见性减少重复说明\n\n我们建议将上下文透明性作为LLM聊天产品的基础设计原则,而非可选特性。", - "charCount": 567, - "collapsed": true - }, - { - "kind": "document", - "fileName": "context-transparency-study-2025.pdf", - "mimeType": "application/pdf", - "snippet": "DOI: 10.1145/3613904.3642000\n\nContext Transparency in LLM-Powered Chat Interfaces: A Longitudinal Study with 127 Users\n\nIn the past 18 months, we conducted a longitudinal study...", - "sizeBytes": 2457600 + "content": "这张图片是什么?" }, { "kind": "media", "mediaType": "image", - "url": "", - "altText": "Fig 3: 实验组 vs 对照组任务完成率对比图(p<0.01)" + "url": "https://cataas.com/cat", + "altText": "用户上传的图片" } ], "timestamp": 1780897900000 @@ -149,51 +49,10 @@ "segments": [ { "kind": "text", - "content": "这是一篇非常有力的参考文献!它恰恰支持了你的核心论点——上下文透明性是“基础设计原则,而非可选特性”。" - } - ], - "timestamp": 1780897920000 - }, - { - "id": "d-4", - "role": "assistant", - "segments": [ - { - "kind": "tool_call_request", - "toolName": "search", - "arguments": { - "query": "LLM context transparency HCI chat interface user study 2024 2025", - "limit": 5 - }, - "collapsed": false - } - ], - "timestamp": 1780897930000 - }, - { - "id": "d-5", - "role": "assistant", - "segments": [ - { - "kind": "tool_call_result", - "toolName": "search", - "success": true, - "result": "Found 5 results:\n\n1. \"Transparent AI: Designing Explainable Chat Interfaces\" — CHI 2024\n → 提出了\"分层透明度\"框架,与本项目的\"信息密度梯度\"高度一致\n\n2. \"User Mental Models of LLM Context\" — CSCW 2025\n → 发现 78% 的用户低估了 system prompt 对输出的影响\n\n3. \"Folding as a Visual Strategy for Complex Chat UIs\" — DIS 2024\n → 验证了折叠/展开对降低认知负荷的效果\n\n4. \"Beyond the Black Box: Visualizing LLM Reasoning in Chat\" — TOCHI 2025\n → 工具调用可视化的用户研究,与本项目的 tool call request/result 设计一致\n\n5. \"Memory-Aware Interfaces for Persistent LLM Interactions\" — IUI 2025\n → 讨论了用户记忆的可编辑性和可遗忘性", - "collapsed": true + "content": "这是一只猫。" } ], "timestamp": 1780897950000 - }, - { - "id": "d-6", - "role": "assistant", - "segments": [ - { - "kind": "text", - "content": "搜索结果显示,你的研究方向和当前学术界的热点高度一致。以下是我对文献支持度的评估:\n\n**强相关(可直接引用):**\n- 第2篇(CSCW 2025)直接量化了用户对 LLM 上下文的理解不足——这是你\"问题定义\"部分的关键论据\n- 第3篇(DIS 2024)为你的\"折叠设计\"提供了实证基础\n\n**建议阅读方向:**\n- 第1篇的\"分层透明度\"框架可以与你的 9 种类型分类做对比讨论\n- 第5篇的 memory 可编辑性可能是你下一个值得探索的方向\n\n这篇研究报告 + 这5篇文献构成了一个很好的文献基础。需要我帮你整理成引用格式吗?" - } - ], - "timestamp": 1780898000000 } ] } diff --git a/src/utils/export.ts b/src/utils/export.ts index 4b01ea4..e47420c 100644 --- a/src/utils/export.ts +++ b/src/utils/export.ts @@ -11,9 +11,26 @@ export interface OpenAIToolCall { } } +/** 多模态 content part:文本 */ +export interface OpenAITextContent { + type: 'text' + text: string +} + +/** 多模态 content part:图片 URL */ +export interface OpenAIImageUrlContent { + type: 'image_url' + image_url: { + url: string + detail?: 'auto' | 'low' | 'high' + } +} + +export type OpenAIContentPart = OpenAITextContent | OpenAIImageUrlContent + export interface OpenAIMessage { role: 'system' | 'user' | 'assistant' | 'tool' - content?: string | null + content?: string | null | OpenAIContentPart[] tool_calls?: OpenAIToolCall[] tool_call_id?: string } @@ -115,6 +132,66 @@ function toolItemToOpenAI(item: ToolItem): OpenAITool { } } +/** + * 将消息的 segment 列表构建为 OpenAI 消息的 content 字段。 + * + * - 纯文本消息 → 返回 string + * - 含图片/音频/视频的消息 → 返回 OpenAIContentPart[](多模态 content 数组) + */ +function buildUserContent(segments: Segment[]): string | OpenAIContentPart[] | null { + const hasMedia = segments.some((s) => s.kind === 'media') + + if (!hasMedia) { + // 纯文本路径:与 segmentToText 行为一致 + const text = segments + .map(segmentToText) + .filter(Boolean) + .join('\n') + return text || null + } + + // 多模态路径:构建 content part 数组 + const parts: OpenAIContentPart[] = [] + for (const seg of segments) { + switch (seg.kind) { + case 'text': + parts.push({ type: 'text', text: seg.content }) + break + case 'long_text': + parts.push({ type: 'text', text: seg.content }) + break + case 'static_var': + parts.push({ type: 'text', text: seg.value }) + break + case 'document': + parts.push({ + type: 'text', + text: `[Document: ${seg.fileName} (${formatBytes(seg.sizeBytes)})]\n${seg.snippet}`, + }) + break + case 'media': + if (seg.mediaType === 'image') { + parts.push({ + type: 'image_url', + image_url: { url: seg.url }, + }) + } else { + // audio / video —— OpenAI 原生支持有限,用 altText 做文本回退 + parts.push({ + type: 'text', + text: seg.altText ?? `[${seg.mediaType}]`, + }) + } + break + default: + // 结构性 segment (system_prompt / memory / skills / tool_overview) + // 以及 tool_call* —— 不在 user content 中出现 + break + } + } + return parts.length > 0 ? parts : null +} + // --- Main export function --- /** @@ -194,15 +271,11 @@ export function exportToOpenAIFormat(envelope: PromptEnvelope): OpenAIExport { if (seg.kind === 'tool_overview') { seg.items.forEach(item => tools.push(toolItemToOpenAI(item))) } - if (seg.kind === 'static_var') continue // 静态变量仅通过 segmentToText 展开到消息内容,不写入 system prompt const s = formatStructural(seg) if (s) systemParts.push(s) } - const content = msg.segments - .map(segmentToText) - .filter(Boolean) - .join('\n') - if (content.trim()) { + const content = buildUserContent(msg.segments) + if (content) { messages.push({ role: 'user', content }) } continue