feat(vision): prioritize main model for image recognition with multi-provider fallback

- Add call_vision method to all bot implementations (DashScope, Claude,
  Gemini, ZhipuAI, MiniMax, Doubao, Moonshot, OpenAICompatibleBot)
  using each vendor's native multimodal API format
- Remove call_with_tools/call_vision from Bot base class to fix MRO
  shadowing issue with OpenAICompatibleBot mixin
- Refactor vision tool provider resolution: MainModel → other configured
  models (auto-discovered) → OpenAI → LinkAI, with automatic fallback
- Return actual model name used in call_vision responses
- Sync config.json API keys to .env bidirectionally on startup
- Fix bot instance cache to detect bot_type/use_linkai config changes
- Add SSE reconnection support for web console
- Preserve image path hints in Gemini text for correct vision tool calls
- Update docs/tools/vision.mdx
This commit is contained in:
zhayujie
2026-04-11 19:46:11 +08:00
parent 3cd92ccda3
commit 26693acc3f
17 changed files with 1173 additions and 359 deletions

View File

@@ -806,15 +806,17 @@ function sendMessage() {
}
function startSSE(requestId, loadingEl, timestamp) {
const es = new EventSource(`/stream?request_id=${encodeURIComponent(requestId)}`);
activeStreams[requestId] = es;
let botEl = null;
let stepsEl = null; // .agent-steps (thinking summaries + tool indicators)
let contentEl = null; // .answer-content (final streaming answer)
let mediaEl = null; // .media-content (images & file attachments)
let accumulatedText = '';
let currentToolEl = null;
let done = false;
const MAX_RECONNECTS = 10;
const RECONNECT_BASE_MS = 1000;
let reconnectCount = 0;
function ensureBotEl() {
if (botEl) return;
@@ -839,180 +841,204 @@ function startSSE(requestId, loadingEl, timestamp) {
mediaEl = botEl.querySelector('.media-content');
}
es.onmessage = function(e) {
let item;
try { item = JSON.parse(e.data); } catch (_) { return; }
function connect() {
const es = new EventSource(`/stream?request_id=${encodeURIComponent(requestId)}`);
activeStreams[requestId] = es;
if (item.type === 'delta') {
ensureBotEl();
accumulatedText += item.content;
contentEl.innerHTML = renderMarkdown(accumulatedText);
scrollChatToBottom();
es.onmessage = function(e) {
let item;
try { item = JSON.parse(e.data); } catch (_) { return; }
} else if (item.type === 'tool_start') {
ensureBotEl();
// Successful data received, reset reconnect counter
reconnectCount = 0;
// Save current thinking as a collapsible step
if (accumulatedText.trim()) {
const fullText = accumulatedText.trim();
const oneLine = fullText.replace(/\n+/g, ' ');
const needsTruncate = oneLine.length > 80;
const stepEl = document.createElement('div');
stepEl.className = 'agent-step agent-thinking-step' + (needsTruncate ? '' : ' no-expand');
if (needsTruncate) {
const truncated = oneLine.substring(0, 80) + '…';
stepEl.innerHTML = `
<div class="thinking-header" onclick="this.parentElement.classList.toggle('expanded')">
<i class="fas fa-lightbulb text-amber-400 flex-shrink-0"></i>
<span class="thinking-summary">${escapeHtml(truncated)}</span>
<i class="fas fa-chevron-right thinking-chevron"></i>
</div>
<div class="thinking-full">${renderMarkdown(fullText)}</div>`;
} else {
stepEl.innerHTML = `
<div class="thinking-header no-toggle">
<i class="fas fa-lightbulb text-amber-400 flex-shrink-0"></i>
<span>${escapeHtml(oneLine)}</span>
</div>`;
if (item.type === 'delta') {
ensureBotEl();
accumulatedText += item.content;
contentEl.innerHTML = renderMarkdown(accumulatedText);
scrollChatToBottom();
} else if (item.type === 'tool_start') {
ensureBotEl();
// Save current thinking as a collapsible step
if (accumulatedText.trim()) {
const fullText = accumulatedText.trim();
const oneLine = fullText.replace(/\n+/g, ' ');
const needsTruncate = oneLine.length > 80;
const stepEl = document.createElement('div');
stepEl.className = 'agent-step agent-thinking-step' + (needsTruncate ? '' : ' no-expand');
if (needsTruncate) {
const truncated = oneLine.substring(0, 80) + '…';
stepEl.innerHTML = `
<div class="thinking-header" onclick="this.parentElement.classList.toggle('expanded')">
<i class="fas fa-lightbulb text-amber-400 flex-shrink-0"></i>
<span class="thinking-summary">${escapeHtml(truncated)}</span>
<i class="fas fa-chevron-right thinking-chevron"></i>
</div>
<div class="thinking-full">${renderMarkdown(fullText)}</div>`;
} else {
stepEl.innerHTML = `
<div class="thinking-header no-toggle">
<i class="fas fa-lightbulb text-amber-400 flex-shrink-0"></i>
<span>${escapeHtml(oneLine)}</span>
</div>`;
}
stepsEl.appendChild(stepEl);
}
stepsEl.appendChild(stepEl);
}
accumulatedText = '';
contentEl.innerHTML = '';
accumulatedText = '';
contentEl.innerHTML = '';
// Add tool execution indicator (collapsible)
currentToolEl = document.createElement('div');
currentToolEl.className = 'agent-step agent-tool-step';
const argsStr = formatToolArgs(item.arguments || {});
currentToolEl.innerHTML = `
<div class="tool-header" onclick="this.parentElement.classList.toggle('expanded')">
<i class="fas fa-cog fa-spin text-primary-400 flex-shrink-0 tool-icon"></i>
<span class="tool-name">${item.tool}</span>
<i class="fas fa-chevron-right tool-chevron"></i>
</div>
<div class="tool-detail">
<div class="tool-detail-section">
<div class="tool-detail-label">Input</div>
<pre class="tool-detail-content">${argsStr}</pre>
// Add tool execution indicator (collapsible)
currentToolEl = document.createElement('div');
currentToolEl.className = 'agent-step agent-tool-step';
const argsStr = formatToolArgs(item.arguments || {});
currentToolEl.innerHTML = `
<div class="tool-header" onclick="this.parentElement.classList.toggle('expanded')">
<i class="fas fa-cog fa-spin text-primary-400 flex-shrink-0 tool-icon"></i>
<span class="tool-name">${item.tool}</span>
<i class="fas fa-chevron-right tool-chevron"></i>
</div>
<div class="tool-detail-section tool-output-section"></div>
</div>`;
stepsEl.appendChild(currentToolEl);
<div class="tool-detail">
<div class="tool-detail-section">
<div class="tool-detail-label">Input</div>
<pre class="tool-detail-content">${argsStr}</pre>
</div>
<div class="tool-detail-section tool-output-section"></div>
</div>`;
stepsEl.appendChild(currentToolEl);
scrollChatToBottom();
scrollChatToBottom();
} else if (item.type === 'tool_end') {
if (currentToolEl) {
const isError = item.status !== 'success';
const icon = currentToolEl.querySelector('.tool-icon');
icon.className = isError
? 'fas fa-times text-red-400 flex-shrink-0 tool-icon'
: 'fas fa-check text-primary-400 flex-shrink-0 tool-icon';
} else if (item.type === 'tool_end') {
if (currentToolEl) {
const isError = item.status !== 'success';
const icon = currentToolEl.querySelector('.tool-icon');
icon.className = isError
? 'fas fa-times text-red-400 flex-shrink-0 tool-icon'
: 'fas fa-check text-primary-400 flex-shrink-0 tool-icon';
// Show execution time
const nameEl = currentToolEl.querySelector('.tool-name');
if (item.execution_time !== undefined) {
nameEl.innerHTML += ` <span class="tool-time">${item.execution_time}s</span>`;
// Show execution time
const nameEl = currentToolEl.querySelector('.tool-name');
if (item.execution_time !== undefined) {
nameEl.innerHTML += ` <span class="tool-time">${item.execution_time}s</span>`;
}
// Fill output section
const outputSection = currentToolEl.querySelector('.tool-output-section');
if (outputSection && item.result) {
outputSection.innerHTML = `
<div class="tool-detail-label">${isError ? 'Error' : 'Output'}</div>
<pre class="tool-detail-content ${isError ? 'tool-error-text' : ''}">${escapeHtml(String(item.result))}</pre>`;
}
if (isError) currentToolEl.classList.add('tool-failed');
currentToolEl = null;
}
// Fill output section
const outputSection = currentToolEl.querySelector('.tool-output-section');
if (outputSection && item.result) {
outputSection.innerHTML = `
<div class="tool-detail-label">${isError ? 'Error' : 'Output'}</div>
<pre class="tool-detail-content ${isError ? 'tool-error-text' : ''}">${escapeHtml(String(item.result))}</pre>`;
}
} else if (item.type === 'image') {
ensureBotEl();
const imgEl = document.createElement('img');
imgEl.src = item.content;
imgEl.alt = 'screenshot';
imgEl.style.cssText = 'max-width:600px;border-radius:8px;margin:8px 0;cursor:pointer;box-shadow:0 1px 4px rgba(0,0,0,0.1);';
imgEl.onclick = () => window.open(item.content, '_blank');
mediaEl.appendChild(imgEl);
scrollChatToBottom();
if (isError) currentToolEl.classList.add('tool-failed');
currentToolEl = null;
} else if (item.type === 'text') {
// Intermediate text sent before media items; display it but keep SSE open.
ensureBotEl();
contentEl.classList.remove('sse-streaming');
const textContent = item.content || accumulatedText;
if (textContent) contentEl.innerHTML = renderMarkdown(textContent);
applyHighlighting(botEl);
scrollChatToBottom();
} else if (item.type === 'video') {
ensureBotEl();
const wrapper = document.createElement('div');
wrapper.innerHTML = _buildVideoHtml(item.content);
mediaEl.appendChild(wrapper.firstElementChild || wrapper);
scrollChatToBottom();
} else if (item.type === 'file') {
ensureBotEl();
const fileName = item.file_name || item.content.split('/').pop();
const fileEl = document.createElement('a');
fileEl.href = item.content;
fileEl.download = fileName;
fileEl.target = '_blank';
fileEl.className = 'file-attachment';
fileEl.style.cssText = 'display:inline-flex;align-items:center;gap:6px;padding:8px 14px;margin:8px 0;border-radius:8px;background:var(--bg-secondary,#f3f4f6);color:var(--text-primary,#374151);text-decoration:none;font-size:14px;border:1px solid var(--border-color,#e5e7eb);';
fileEl.innerHTML = `<i class="fas fa-file-download" style="color:#6b7280;"></i> ${fileName}`;
mediaEl.appendChild(fileEl);
scrollChatToBottom();
} else if (item.type === 'phase') {
// Coarse progress (e.g. cow install-browser); must not close SSE (unlike "done")
ensureBotEl();
const wrap = document.createElement('div');
wrap.className = 'text-xs sm:text-sm text-slate-600 dark:text-slate-400 border-l-2 border-primary-400 pl-2 py-1 my-0.5';
wrap.textContent = String(item.content || '');
stepsEl.appendChild(wrap);
scrollChatToBottom();
} else if (item.type === 'done') {
done = true;
es.close();
delete activeStreams[requestId];
// item.content may be empty when "done" is only a stream-close signal after media.
const finalText = item.content || accumulatedText;
if (!botEl && finalText) {
if (loadingEl) { loadingEl.remove(); loadingEl = null; }
addBotMessage(finalText, new Date((item.timestamp || Date.now() / 1000) * 1000), requestId);
} else if (botEl) {
contentEl.classList.remove('sse-streaming');
// Only update text content when there is something new to show.
if (finalText) contentEl.innerHTML = renderMarkdown(finalText);
applyHighlighting(botEl);
}
scrollChatToBottom();
} else if (item.type === 'error') {
done = true;
es.close();
delete activeStreams[requestId];
if (loadingEl) { loadingEl.remove(); loadingEl = null; }
addBotMessage(t('error_send'), new Date());
}
};
} else if (item.type === 'image') {
ensureBotEl();
const imgEl = document.createElement('img');
imgEl.src = item.content;
imgEl.alt = 'screenshot';
imgEl.style.cssText = 'max-width:600px;border-radius:8px;margin:8px 0;cursor:pointer;box-shadow:0 1px 4px rgba(0,0,0,0.1);';
imgEl.onclick = () => window.open(item.content, '_blank');
mediaEl.appendChild(imgEl);
scrollChatToBottom();
} else if (item.type === 'text') {
// Intermediate text sent before media items; display it but keep SSE open.
ensureBotEl();
contentEl.classList.remove('sse-streaming');
const textContent = item.content || accumulatedText;
if (textContent) contentEl.innerHTML = renderMarkdown(textContent);
applyHighlighting(botEl);
scrollChatToBottom();
} else if (item.type === 'video') {
ensureBotEl();
const wrapper = document.createElement('div');
wrapper.innerHTML = _buildVideoHtml(item.content);
mediaEl.appendChild(wrapper.firstElementChild || wrapper);
scrollChatToBottom();
} else if (item.type === 'file') {
ensureBotEl();
const fileName = item.file_name || item.content.split('/').pop();
const fileEl = document.createElement('a');
fileEl.href = item.content;
fileEl.download = fileName;
fileEl.target = '_blank';
fileEl.className = 'file-attachment';
fileEl.style.cssText = 'display:inline-flex;align-items:center;gap:6px;padding:8px 14px;margin:8px 0;border-radius:8px;background:var(--bg-secondary,#f3f4f6);color:var(--text-primary,#374151);text-decoration:none;font-size:14px;border:1px solid var(--border-color,#e5e7eb);';
fileEl.innerHTML = `<i class="fas fa-file-download" style="color:#6b7280;"></i> ${fileName}`;
mediaEl.appendChild(fileEl);
scrollChatToBottom();
} else if (item.type === 'phase') {
// Coarse progress (e.g. cow install-browser); must not close SSE (unlike "done")
ensureBotEl();
const wrap = document.createElement('div');
wrap.className = 'text-xs sm:text-sm text-slate-600 dark:text-slate-400 border-l-2 border-primary-400 pl-2 py-1 my-0.5';
wrap.textContent = String(item.content || '');
stepsEl.appendChild(wrap);
scrollChatToBottom();
} else if (item.type === 'done') {
es.onerror = function() {
es.close();
delete activeStreams[requestId];
// item.content may be empty when "done" is only a stream-close signal after media.
const finalText = item.content || accumulatedText;
if (done) return;
if (!botEl && finalText) {
if (loadingEl) { loadingEl.remove(); loadingEl = null; }
addBotMessage(finalText, new Date((item.timestamp || Date.now() / 1000) * 1000), requestId);
} else if (botEl) {
if (reconnectCount < MAX_RECONNECTS) {
reconnectCount++;
const delay = Math.min(RECONNECT_BASE_MS * reconnectCount, 5000);
console.warn(`[SSE] connection lost for ${requestId}, reconnecting in ${delay}ms (attempt ${reconnectCount}/${MAX_RECONNECTS})`);
setTimeout(connect, delay);
return;
}
// Exhausted retries, show whatever we have
if (loadingEl) { loadingEl.remove(); loadingEl = null; }
if (!botEl) {
addBotMessage(t('error_send'), new Date());
} else if (accumulatedText) {
contentEl.classList.remove('sse-streaming');
// Only update text content when there is something new to show.
if (finalText) contentEl.innerHTML = renderMarkdown(finalText);
contentEl.innerHTML = renderMarkdown(accumulatedText);
applyHighlighting(botEl);
}
scrollChatToBottom();
};
}
} else if (item.type === 'error') {
es.close();
delete activeStreams[requestId];
if (loadingEl) { loadingEl.remove(); loadingEl = null; }
addBotMessage(t('error_send'), new Date());
}
};
es.onerror = function() {
es.close();
delete activeStreams[requestId];
if (loadingEl) { loadingEl.remove(); loadingEl = null; }
if (!botEl) {
addBotMessage(t('error_send'), new Date());
} else if (accumulatedText) {
contentEl.classList.remove('sse-streaming');
contentEl.innerHTML = renderMarkdown(accumulatedText);
applyHighlighting(botEl);
}
};
connect();
}
function startPolling() {

View File

@@ -329,14 +329,18 @@ class WebChannel(ChatChannel):
"""
SSE generator for a given request_id.
Yields UTF-8 encoded bytes to avoid WSGI Latin-1 mangling.
Supports client reconnection: the queue is only removed after a
"done" event is consumed, so a new GET /stream with the same
request_id can resume reading remaining events.
"""
if request_id not in self.sse_queues:
yield b"data: {\"type\": \"error\", \"message\": \"invalid request_id\"}\n\n"
return
q = self.sse_queues[request_id]
timeout = 300 # 5 minutes max
deadline = time.time() + timeout
idle_timeout = 600 # 10 minutes without any real event
deadline = time.time() + idle_timeout
done = False
try:
while time.time() < deadline:
@@ -346,13 +350,18 @@ class WebChannel(ChatChannel):
yield b": keepalive\n\n"
continue
# Real event received, reset idle deadline
deadline = time.time() + idle_timeout
payload = json.dumps(item, ensure_ascii=False)
yield f"data: {payload}\n\n".encode("utf-8")
if item.get("type") == "done":
done = True
break
finally:
self.sse_queues.pop(request_id, None)
if done:
self.sse_queues.pop(request_id, None)
def poll_response(self):
"""