From 14a119c48ce9bc4c602a228eee3fb18d60971a1d Mon Sep 17 00:00:00 2001
From: zhayujie <yjzha1996@163.com>
Date: Sat, 18 Apr 2026 21:18:27 +0800
Subject: [PATCH] fix(gemini): solve the problem of tool calls not returning

---
 agent/protocol/agent_stream.py     | 12 ++++++++
 channel/web/static/js/console.js   | 34 +++++++++++++++++++++-
 models/gemini/google_gemini_bot.py | 46 +++++++++++++++++++++++++++++-
 3 files changed, 90 insertions(+), 2 deletions(-)
diff --git a/agent/protocol/agent_stream.py b/agent/protocol/agent_stream.py
index d7fc5066..3e6bc4e4 100644
--- a/agent/protocol/agent_stream.py
+++ b/agent/protocol/agent_stream.py
@@ -241,6 +241,9 @@ class AgentStreamExecutor:
                         if turn > 1:
                             logger.info(f"[Agent] Requesting explicit response from LLM...")
                             
+                            # Remember position so we can remove the injected prompt later
+                            prompt_insert_idx = len(self.messages)
+                            
                             # 添加一条消息，明确要求回复用户
                             self.messages.append({
                                 "role": "user",
@@ -254,6 +257,15 @@ class AgentStreamExecutor:
                             assistant_msg, tool_calls = self._call_llm_stream(retry_on_empty=False)
                             final_response = assistant_msg
                             
+                            # Remove the injected prompt from history so it doesn't
+                            # appear as a user message in persisted conversations.
+                            # _call_llm_stream may have appended an assistant message
+                            # after the prompt, so we locate and remove only the prompt.
+                            if (prompt_insert_idx < len(self.messages)
+                                    and self.messages[prompt_insert_idx].get("role") == "user"):
+                                self.messages.pop(prompt_insert_idx)
+                                logger.debug("[Agent] Removed injected explicit-response prompt from message history")
+                            
                             # If LLM responded with tool_calls instead of text, fall through
                             # to the tool execution path below (don't break the loop).
                             if tool_calls:
diff --git a/channel/web/static/js/console.js b/channel/web/static/js/console.js
index 19d2c940..099ef32a 100644
--- a/channel/web/static/js/console.js
+++ b/channel/web/static/js/console.js
@@ -339,6 +339,7 @@ function createMd() {
 const md = createMd();
 
 const VIDEO_EXT_RE = /\.(?:mp4|webm|mov|avi|mkv)$/i;  // tested against URL without query string
+const IMAGE_EXT_RE = /\.(?:jpg|jpeg|png|gif|webp|bmp|svg)$/i;  // tested against URL without query string
 
 function _buildVideoHtml(url) {
     const fileName = url.split('/').pop().split('?')[0];
@@ -351,6 +352,15 @@ function _buildVideoHtml(url) {
         `<i class="fas fa-download"></i> ${escapeHtml(fileName)}</a></div>`;
 }
 
+function _buildImageHtml(url) {  // Build a click-to-open inline <img> preview for an image URL.
+    const safeUrl = url.replace(/"/g, '&quot;');  // a raw double quote would close the src="..." attribute
+    return `<div style="margin:10px 0;">` +
+        `<img src="${safeUrl}" alt="image" loading="lazy" ` +
+        `onclick="window.open(this.src,'_blank')" ` +  // read the URL from the element: a "'" spliced into the handler would end the JS string
+        `style="max-width:600px;width:100%;border-radius:10px;box-shadow:0 2px 8px rgba(0,0,0,0.15);display:block;cursor:pointer;">` +
+        `</div>`;
+}
+
 function injectVideoPlayers(html) {
     // Step 1: replace markdown-it anchor tags whose href points to a video file.
     const step1 = html.replace(
@@ -369,10 +379,32 @@ function injectVideoPlayers(html) {
     }).join('');
 }
 
+// Convert image URLs into inline <img> previews. Mirrors injectVideoPlayers but for images.
+// Handles three cases produced by markdown-it:
+//   1. <a href="...image.jpg">text</a>  -> replaced by a preview (the anchor's own text is dropped)
+//   2. <img src="...">                  (markdown image syntax) -> left exactly as-is; tags are never rewritten here
+//   3. raw URL still present in a text node -> replaced by a preview, only as a safety net
+function injectImagePreviews(html) {
+    // Step 1: anchor whose href points to an image file -> replace with <img> preview.
+    const step1 = html.replace(
+        /<a\s+href="(https?:\/\/[^"]+)"[^>]*>[^<]*<\/a>/gi,
+        (match, url) => IMAGE_EXT_RE.test(url.split('?')[0]) ? _buildImageHtml(url) : match
+    );
+    // Step 2: bare image URLs left in text between tags (rare — linkify usually catches them; <code>/<pre> text is scanned too).
+    return step1.split(/(<[^>]+>)/).map((chunk, idx) => {
+        if (idx % 2 !== 0) return chunk;  // odd slots hold the captured tags themselves; pass them through
+        return chunk.replace(/https?:\/\/\S+/gi, (url) => {
+            const bare = url.replace(/[),.\s]+$/, '');  // trailing punctuation is not part of the URL
+            return IMAGE_EXT_RE.test(bare.split('?')[0]) ? _buildImageHtml(bare) + url.slice(bare.length) : url;  // re-append the stripped suffix so it is not lost
+        });
+    }).join('');
+}
+
 function renderMarkdown(text) {
     try {
         const html = md.render(text);
-        return injectVideoPlayers(html);
+        // Order matters: video first (more specific), then image.
+        return injectImagePreviews(injectVideoPlayers(html));
     }
     catch (e) { return text.replace(/\n/g, '<br>'); }
 }
diff --git a/models/gemini/google_gemini_bot.py b/models/gemini/google_gemini_bot.py
index e379d912..ab3eae4c 100644
--- a/models/gemini/google_gemini_bot.py
+++ b/models/gemini/google_gemini_bot.py
@@ -335,6 +335,18 @@ class GoogleGeminiBot(Bot):
                 # Convert role
                 gemini_role = "user" if role in ["user", "tool"] else "model"
                 
+                # For model messages that carry original Gemini parts (with
+                # thoughtSignature etc.), use them directly instead of
+                # reconstructing from Claude-format tool_use blocks.
+                if gemini_role == "model" and "_gemini_raw_parts" in msg:
+                    raw_parts = msg["_gemini_raw_parts"]
+                    if raw_parts:
+                        payload["contents"].append({
+                            "role": "model",
+                            "parts": raw_parts
+                        })
+                        continue
+                
                 # Handle different content formats
                 parts = []
                 
@@ -398,6 +410,17 @@ class GoogleGeminiBot(Bot):
                             else:
                                 logger.warning(f"[Gemini] Skip invalid image block: {str(block)[:200]}")
                             
+                        elif block_type == "tool_use":
+                            # Convert Claude tool_use to Gemini functionCall
+                            fc_name = block.get("name", "unknown")
+                            fc_args = block.get("input") or {}
+                            parts.append({
+                                "functionCall": {
+                                    "name": fc_name,
+                                    "args": fc_args
+                                }
+                            })
+
                         elif block_type == "tool_result":
                             # Convert Claude tool_result to Gemini functionResponse
                             tool_use_id = block.get("tool_use_id")
@@ -648,6 +671,7 @@ class GoogleGeminiBot(Bot):
         """Handle Gemini REST API stream response"""
         try:
             all_tool_calls = []
+            all_raw_parts = []  # Preserve all Gemini parts (incl. thoughtSignature) for round-trip
             has_sent_tool_calls = False
             has_content = False  # Track if any content was sent
             chunk_count = 0
@@ -733,6 +757,9 @@ class GoogleGeminiBot(Bot):
                                     "arguments": json.dumps(fc.get("args", {}))
                                 }
                             })
+
+                    # Preserve all raw parts for round-trip (thoughtSignature, etc.)
+                    all_raw_parts.extend(parts)
                     
                 except json.JSONDecodeError as je:
                     logger.debug(f"[Gemini] JSON decode error: {je}, line={line[:500]}")
@@ -740,6 +767,9 @@ class GoogleGeminiBot(Bot):
             
             # Send tool calls if any were collected
             if all_tool_calls and not has_sent_tool_calls:
+                delta = {"tool_calls": all_tool_calls}
+                if all_raw_parts:
+                    delta["_gemini_raw_parts"] = all_raw_parts
                 yield {
                     "id": f"chatcmpl-{time.time()}",
                     "object": "chat.completion.chunk",
@@ -747,11 +777,25 @@ class GoogleGeminiBot(Bot):
                     "model": model_name,
                     "choices": [{
                         "index": 0,
-                        "delta": {"tool_calls": all_tool_calls},
+                        "delta": delta,
                         "finish_reason": None
                     }]
                 }
                 has_sent_tool_calls = True
+            elif not has_sent_tool_calls and all_raw_parts:
+                # No tool calls but we have raw parts (e.g. text-only response with
+                # thoughtSignature) — pass them through for round-trip fidelity.
+                yield {
+                    "id": f"chatcmpl-{time.time()}",
+                    "object": "chat.completion.chunk",
+                    "created": int(time.time()),
+                    "model": model_name,
+                    "choices": [{
+                        "index": 0,
+                        "delta": {"_gemini_raw_parts": all_raw_parts},
+                        "finish_reason": None
+                    }]
+                }
             
             # 如果返回空响应，dump 完整原始 chunks 以便诊断
             if not has_content and not all_tool_calls: