mirror of
https://github.com/zhayujie/chatgpt-on-wechat.git
synced 2026-06-02 00:57:41 +08:00
feat: add skills and upgrade feishu/dingtalk channel
This commit is contained in:
168
skills/openai-image-vision/EXAMPLE.md
Normal file
168
skills/openai-image-vision/EXAMPLE.md
Normal file
@@ -0,0 +1,168 @@
|
||||
# OpenAI Image Vision - Usage Examples
|
||||
|
||||
## Setup
|
||||
|
||||
Set up your API credentials using the agent's env_config tool:
|
||||
|
||||
```bash
|
||||
# Set your OpenAI API key
|
||||
env_config(action="set", key="OPENAI_API_KEY", value="sk-your-api-key-here")
|
||||
|
||||
# Optional: Set custom API base URL (for proxy or compatible services)
|
||||
env_config(action="set", key="OPENAI_API_BASE", value="https://api.openai.com/v1")
|
||||
```
|
||||
|
||||
## Example 1: Analyze a Local Image
|
||||
|
||||
```bash
|
||||
bash scripts/vision.sh "/path/to/photo.jpg" "What's in this image?"
|
||||
```
|
||||
|
||||
**Expected Output:**
|
||||
```json
|
||||
{
|
||||
"model": "gpt-4.1-mini",
|
||||
"content": "The image shows a beautiful landscape with mountains in the background and a lake in the foreground. The sky is clear with some clouds, and there are trees along the shoreline.",
|
||||
"usage": {
|
||||
"prompt_tokens": 1234,
|
||||
"completion_tokens": 45,
|
||||
"total_tokens": 1279
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Example 2: Analyze an Image from URL
|
||||
|
||||
```bash
|
||||
bash scripts/vision.sh "https://example.com/image.jpg" "Describe this image in detail"
|
||||
```
|
||||
|
||||
## Example 3: Extract Text (OCR)
|
||||
|
||||
```bash
|
||||
bash scripts/vision.sh "document.png" "Extract all text from this image"
|
||||
```
|
||||
|
||||
**Use Case:** Extract text from screenshots, scanned documents, or photos of text.
|
||||
|
||||
## Example 4: Identify Objects
|
||||
|
||||
```bash
|
||||
bash scripts/vision.sh "scene.jpg" "List all objects you can identify in this image"
|
||||
```
|
||||
|
||||
## Example 5: Analyze Colors and Composition
|
||||
|
||||
```bash
|
||||
bash scripts/vision.sh "artwork.jpg" "Describe the color palette and composition of this image"
|
||||
```
|
||||
|
||||
## Example 6: Count Items
|
||||
|
||||
```bash
|
||||
bash scripts/vision.sh "crowd.jpg" "How many people are in this image?"
|
||||
```
|
||||
|
||||
## Example 7: Use Different Models
|
||||
|
||||
```bash
|
||||
# Use gpt-4.1-mini (default, latest mini model)
|
||||
bash scripts/vision.sh "image.jpg" "Analyze this" "gpt-4.1-mini"
|
||||
|
||||
# Use gpt-4.1 (most capable, best for complex analysis)
|
||||
bash scripts/vision.sh "image.jpg" "Analyze this" "gpt-4.1"
|
||||
|
||||
# Use gpt-4o-mini (previous mini model)
|
||||
bash scripts/vision.sh "image.jpg" "Analyze this" "gpt-4o-mini"
|
||||
```
|
||||
|
||||
## Example 8: Complex Analysis
|
||||
|
||||
```bash
|
||||
bash scripts/vision.sh "product.jpg" "Analyze this product image. Describe the product, its features, colors, and suggest what kind of marketing copy would work well for it."
|
||||
```
|
||||
|
||||
## Example 9: Safety and Content Moderation
|
||||
|
||||
```bash
|
||||
bash scripts/vision.sh "content.jpg" "Is there any inappropriate or unsafe content in this image?"
|
||||
```
|
||||
|
||||
## Example 10: Technical Analysis
|
||||
|
||||
```bash
|
||||
bash scripts/vision.sh "diagram.png" "Explain what this technical diagram represents and how it works"
|
||||
```
|
||||
|
||||
## Integration with Agent
|
||||
|
||||
When the agent loads this skill, it will be available in the `<available_skills>` section. The agent can use it like:
|
||||
|
||||
```bash
|
||||
bash "<base_dir>/scripts/vision.sh" "user_uploaded_image.jpg" "What's in this image?"
|
||||
```
|
||||
|
||||
The `<base_dir>` will be automatically provided by the skill system.
|
||||
|
||||
## Error Handling Examples
|
||||
|
||||
### Missing API Key
|
||||
```bash
|
||||
$ bash scripts/vision.sh "image.jpg" "What is this?"
|
||||
{"error": "OPENAI_API_KEY environment variable is not set", "help": "Visit https://platform.openai.com/api-keys to get an API key"}
|
||||
```
|
||||
|
||||
### File Not Found
|
||||
```bash
|
||||
$ bash scripts/vision.sh "nonexistent.jpg" "What is this?"
|
||||
{"error": "Image file not found", "path": "nonexistent.jpg"}
|
||||
```
|
||||
|
||||
### Unsupported Format
|
||||
```bash
|
||||
$ bash scripts/vision.sh "file.bmp" "What is this?"
|
||||
{"error": "Unsupported image format", "extension": "bmp", "supported": ["jpg", "jpeg", "png", "gif", "webp"]}
|
||||
```
|
||||
|
||||
### Missing Parameters
|
||||
```bash
|
||||
$ bash scripts/vision.sh
|
||||
{"error": "Image path or URL is required", "usage": "bash vision.sh <image_path_or_url> <question> [model]"}
|
||||
```
|
||||
|
||||
## Tips for Best Results
|
||||
|
||||
1. **Be Specific**: Ask clear, specific questions about what you want to know
|
||||
2. **Image Quality**: Higher quality images generally produce better results
|
||||
3. **Model Selection**:
|
||||
- Use `gpt-4.1` for complex analysis requiring highest accuracy
|
||||
- Use `gpt-4.1-mini` (default) for most tasks - latest mini model with good balance
|
||||
4. **Text Extraction**: For OCR tasks, ensure text is clearly visible and not too small
|
||||
5. **Multiple Aspects**: You can ask about multiple things in one question
|
||||
6. **Context**: Provide context in your question if needed (e.g., "This is a medical scan, what do you see?")
|
||||
|
||||
## Performance Notes
|
||||
|
||||
- **Local Files**: Automatically base64-encoded, adds ~33% size overhead
|
||||
- **URLs**: Passed directly to API, no encoding overhead
|
||||
- **Timeout**: 60 seconds for API calls
|
||||
- **Max Tokens**: 1000 tokens for responses (configurable in script)
|
||||
- **Rate Limits**: Subject to your OpenAI API plan
|
||||
|
||||
## Supported Image Formats
|
||||
|
||||
✅ JPEG (`.jpg`, `.jpeg`)
|
||||
✅ PNG (`.png`)
|
||||
✅ GIF (`.gif`)
|
||||
✅ WebP (`.webp`)
|
||||
|
||||
❌ BMP, TIFF, SVG, and other formats are not supported
|
||||
|
||||
## Cost Considerations
|
||||
|
||||
Vision API calls cost more than text-only calls because they include image tokens. Costs vary by:
|
||||
- Model used (gpt-4.1 vs gpt-4.1-mini)
|
||||
- Image size and resolution
|
||||
- Length of response
|
||||
|
||||
Check OpenAI's pricing page for current rates: https://openai.com/pricing
|
||||
178
skills/openai-image-vision/README.md
Normal file
178
skills/openai-image-vision/README.md
Normal file
@@ -0,0 +1,178 @@
|
||||
# OpenAI Image Vision Skill
|
||||
|
||||
This skill enables image analysis using OpenAI's Vision API (GPT-4 Vision models).
|
||||
|
||||
## Features
|
||||
|
||||
- ✅ Analyze images from local files or URLs
|
||||
- ✅ Support for multiple image formats (JPEG, PNG, GIF, WebP)
|
||||
- ✅ Automatic base64 encoding for local files
|
||||
- ✅ Direct URL passing for remote images
|
||||
- ✅ Configurable model selection
|
||||
- ✅ Custom API base URL support
|
||||
- ✅ Pure bash/curl implementation (no Python dependencies)
|
||||
|
||||
## Quick Start
|
||||
|
||||
1. **Set up API credentials using env_config:**
|
||||
```bash
|
||||
env_config(action="set", key="OPENAI_API_KEY", value="sk-your-api-key-here")
|
||||
# Optional: custom API base
|
||||
env_config(action="set", key="OPENAI_API_BASE", value="https://api.openai.com/v1")
|
||||
```
|
||||
|
||||
2. **Analyze an image:**
|
||||
```bash
|
||||
bash scripts/vision.sh "/path/to/photo.jpg" "What's in this image?"
|
||||
```
|
||||
|
||||
3. **Analyze from URL:**
|
||||
```bash
|
||||
bash scripts/vision.sh "https://example.com/image.jpg" "Describe this image"
|
||||
```
|
||||
|
||||
## Usage Examples
|
||||
|
||||
### Basic image analysis
|
||||
```bash
|
||||
bash scripts/vision.sh "photo.jpg" "What objects can you see?"
|
||||
```
|
||||
|
||||
### Text extraction (OCR)
|
||||
```bash
|
||||
bash scripts/vision.sh "document.png" "Extract all text from this image"
|
||||
```
|
||||
|
||||
### Detailed description
|
||||
```bash
|
||||
bash scripts/vision.sh "scene.jpg" "Describe this scene in detail, including colors, mood, and composition"
|
||||
```
|
||||
|
||||
### Using different models
|
||||
```bash
|
||||
# Use gpt-4.1-mini (default, latest mini model)
|
||||
bash scripts/vision.sh "image.jpg" "Analyze this" "gpt-4.1-mini"
|
||||
|
||||
# Use gpt-4.1 (most capable, latest model)
|
||||
bash scripts/vision.sh "image.jpg" "Analyze this" "gpt-4.1"
|
||||
|
||||
# Use gpt-4o-mini (previous mini model)
|
||||
bash scripts/vision.sh "image.jpg" "Analyze this" "gpt-4o-mini"
|
||||
```
|
||||
|
||||
## Environment Variables
|
||||
|
||||
| Variable | Required | Default | Description |
|
||||
|----------|----------|---------|-------------|
|
||||
| `OPENAI_API_KEY` | Yes | - | Your OpenAI API key |
|
||||
| `OPENAI_API_BASE` | No | `https://api.openai.com/v1` | Custom API base URL |
|
||||
|
||||
## Response Format
|
||||
|
||||
Success response:
|
||||
```json
|
||||
{
|
||||
"model": "gpt-4.1-mini",
|
||||
"content": "The image shows a beautiful sunset over mountains...",
|
||||
"usage": {
|
||||
"prompt_tokens": 1234,
|
||||
"completion_tokens": 567,
|
||||
"total_tokens": 1801
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Error response:
|
||||
```json
|
||||
{
|
||||
"error": "Error description",
|
||||
"details": "Additional information"
|
||||
}
|
||||
```
|
||||
|
||||
## Supported Models
|
||||
|
||||
- `gpt-4.1-mini` (default) - Latest mini model, fast and cost-effective
|
||||
- `gpt-4.1` - Latest GPT-4 variant, most capable
|
||||
- `gpt-4o-mini` - Previous generation mini model
|
||||
- `gpt-4-turbo` - Previous generation turbo model
|
||||
|
||||
## Supported Image Formats
|
||||
|
||||
- JPEG (`.jpg`, `.jpeg`)
|
||||
- PNG (`.png`)
|
||||
- GIF (`.gif`)
|
||||
- WebP (`.webp`)
|
||||
|
||||
## Technical Details
|
||||
|
||||
- **Implementation**: Pure bash script using curl and base64
|
||||
- **Timeout**: 60 seconds for API calls
|
||||
- **Max tokens**: 1000 tokens for responses
|
||||
- **Image handling**:
|
||||
- Local files are automatically base64-encoded
|
||||
- URLs are passed directly to the API
|
||||
- MIME types are auto-detected from file extensions
|
||||
|
||||
## Error Handling
|
||||
|
||||
The script handles various error cases:
|
||||
- Missing required parameters
|
||||
- Missing API key
|
||||
- File not found
|
||||
- Unsupported image formats
|
||||
- API errors
|
||||
- Network timeouts
|
||||
- Invalid JSON responses
|
||||
|
||||
## Integration with Agent System
|
||||
|
||||
When loaded by the agent system, this skill will appear in `<available_skills>` with a `<base_dir>` path. Use it like:
|
||||
|
||||
```bash
|
||||
bash "<base_dir>/scripts/vision.sh" "image.jpg" "What's in this image?"
|
||||
```
|
||||
|
||||
The agent will automatically:
|
||||
- Load environment variables from `~/.cow/.env`
|
||||
- Provide the correct `<base_dir>` path
|
||||
- Handle skill discovery and registration
|
||||
|
||||
## Notes
|
||||
|
||||
- Images are sent to OpenAI's servers for processing
|
||||
- Large images may be automatically resized by the API
|
||||
- Rate limits depend on your OpenAI API plan
|
||||
- Token usage includes both the image and text in the prompt
|
||||
- Base64 encoding increases the size of local images by ~33%
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**"OPENAI_API_KEY environment variable is not set"**
|
||||
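- Set the key with the agent's env_config tool: `env_config(action="set", key="OPENAI_API_KEY", value="sk-your-api-key-here")`
|
||||
- Or, when running the script outside the agent, export `OPENAI_API_KEY` in your shell first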
|
||||
|
||||
**"Image file not found"**
|
||||
- Check the file path is correct
|
||||
- Use absolute paths or paths relative to current directory
|
||||
|
||||
**"Unsupported image format"**
|
||||
- Only JPEG, PNG, GIF, and WebP are supported
|
||||
- Check the file extension matches the actual format
|
||||
|
||||
**"Failed to call OpenAI API"**
|
||||
- Check your internet connection
|
||||
- Verify the API key is valid
|
||||
- Check if custom API base URL is correct
|
||||
|
||||
## License
|
||||
|
||||
Part of the chatgpt-on-wechat project.
|
||||
119
skills/openai-image-vision/SKILL.md
Normal file
119
skills/openai-image-vision/SKILL.md
Normal file
@@ -0,0 +1,119 @@
|
||||
---
|
||||
name: openai-image-vision
|
||||
description: Analyze images using OpenAI's Vision API. Use bash command to execute the vision script like 'bash <base_dir>/scripts/vision.sh <image> <question>'. Can understand image content, objects, text, colors, and answer questions about images.
|
||||
homepage: https://platform.openai.com/docs/guides/vision
|
||||
metadata:
|
||||
emoji: 👁️
|
||||
requires:
|
||||
bins: ["curl", "base64"]
|
||||
env: ["OPENAI_API_KEY"]
|
||||
primaryEnv: "OPENAI_API_KEY"
|
||||
---
|
||||
|
||||
# OpenAI Image Vision
|
||||
|
||||
Analyze images using OpenAI's GPT-4 Vision API. The model can understand visual elements including objects, shapes, colors, textures, and text within images.
|
||||
|
||||
## Setup
|
||||
|
||||
This skill requires an OpenAI API key. If not configured:
|
||||
|
||||
1. Get your API key from https://platform.openai.com/api-keys
|
||||
2. Set the key using: `env_config(action="set", key="OPENAI_API_KEY", value="your-key")`
|
||||
|
||||
Optional: Set custom API base URL (default: https://api.openai.com/v1):
|
||||
```bash
|
||||
env_config(action="set", key="OPENAI_API_BASE", value="your-base-url")
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
**Important**: Scripts are located relative to this skill's base directory.
|
||||
|
||||
When you see this skill in `<available_skills>`, note the `<base_dir>` path.
|
||||
|
||||
**CRITICAL**: Always use `bash` command to execute the script:
|
||||
|
||||
```bash
|
||||
# General pattern (MUST start with bash):
|
||||
bash "<base_dir>/scripts/vision.sh" "<image_path_or_url>" "<question>" [model]
|
||||
|
||||
# DO NOT execute the script directly like this (WRONG):
|
||||
# "<base_dir>/scripts/vision.sh" ...
|
||||
|
||||
# Parameters:
|
||||
# - image_path_or_url: Local image file path or HTTP(S) URL (required)
|
||||
# - question: Question to ask about the image (required)
|
||||
# - model: OpenAI model to use (default: gpt-4.1-mini)
|
||||
# Options: gpt-4.1-mini, gpt-4.1, gpt-4o-mini, gpt-4-turbo
|
||||
```
|
||||
|
||||
## Examples
|
||||
|
||||
### Analyze a local image
|
||||
```bash
|
||||
bash "<base_dir>/scripts/vision.sh" "/path/to/image.jpg" "What's in this image?"
|
||||
```
|
||||
|
||||
### Analyze an image from URL
|
||||
```bash
|
||||
bash "<base_dir>/scripts/vision.sh" "https://example.com/image.jpg" "Describe this image in detail"
|
||||
```
|
||||
|
||||
### Use specific model
|
||||
```bash
|
||||
bash "<base_dir>/scripts/vision.sh" "/path/to/photo.png" "What colors are prominent?" "gpt-4o-mini"
|
||||
```
|
||||
|
||||
### Extract text from image
|
||||
```bash
|
||||
bash "<base_dir>/scripts/vision.sh" "/path/to/document.jpg" "Extract all text from this image"
|
||||
```
|
||||
|
||||
### Analyze multiple aspects
|
||||
```bash
|
||||
bash "<base_dir>/scripts/vision.sh" "image.jpg" "List all objects you can see and describe the overall scene"
|
||||
```
|
||||
|
||||
## Supported Image Formats
|
||||
|
||||
- JPEG (.jpg, .jpeg)
|
||||
- PNG (.png)
|
||||
- GIF (.gif)
|
||||
- WebP (.webp)
|
||||
|
||||
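**Performance Optimization**: Local files larger than 1MB are downscaled to at most 800px on the longest side before upload, provided `sips` (macOS) or ImageMagick `convert` is installed; otherwise the original file is sent unchanged. Downscaling keeps the request small, but very small text or fine detail may become harder to read, so crop or pre-resize such images yourself when accuracy matters.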
|
||||
|
||||
## Response Format
|
||||
|
||||
The script returns a JSON response:
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "gpt-4.1-mini",
|
||||
"content": "The image shows...",
|
||||
"usage": {
|
||||
"prompt_tokens": 1234,
|
||||
"completion_tokens": 567,
|
||||
"total_tokens": 1801
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Or in case of error:
|
||||
|
||||
```json
|
||||
{
|
||||
"error": "Error description",
|
||||
"details": "Additional error information"
|
||||
}
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
- **Image size**: Images are automatically resized if too large
|
||||
- **Timeout**: 60 seconds for API calls
|
||||
- **Rate limits**: Subject to your OpenAI API plan limits
|
||||
- **Privacy**: Images are sent to OpenAI's servers for processing
|
||||
- **Local files**: Automatically converted to base64 for API submission
|
||||
- **URLs**: Can be passed directly to the API without downloading
|
||||
233
skills/openai-image-vision/scripts/vision.sh
Executable file
233
skills/openai-image-vision/scripts/vision.sh
Executable file
@@ -0,0 +1,233 @@
|
||||
#!/usr/bin/env bash
# OpenAI Vision API wrapper
# API Docs: https://platform.openai.com/docs/guides/vision
#
# Usage:
#   bash vision.sh <image_path_or_url> <question> [model]
#
# Environment:
#   OPENAI_API_KEY   required; sent as the Bearer token
#   OPENAI_API_BASE  optional; defaults to https://api.openai.com/v1
#
# Output:
#   One JSON object on stdout.
#   Success: {"model": ..., "content": ..., "usage": {...}}
#            ("content" is kept JSON-escaped so the object stays valid JSON)
#   Failure: {"error": ..., ...} and exit status 1.

set -euo pipefail

readonly MAX_UPLOAD_BYTES=1048576 # 1MB; larger local files are downscaled first
readonly API_TIMEOUT_SECONDS=60

# Directory holding the downscaled copy of a large image (empty when unused).
temp_dir=""
# data: URI produced by encode_local_image.
image_data_url=""

# Remove the temporary directory on every exit path.
cleanup() {
  if [ -n "$temp_dir" ]; then
    rm -rf -- "$temp_dir"
  fi
}
trap cleanup EXIT

# Print the JSON error object given in $1 on stdout and exit with status 1.
fail() {
  printf '%s\n' "$1"
  exit 1
}

# Escape $1 for use inside a JSON string literal (backslash, double quote,
# newline, carriage return, tab). Other control characters are passed through.
json_escape() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

# Print the still-escaped value of the first string field named $1 found in
# the JSON text $2; prints nothing when the field is absent.
extract_json_string() {
  local key=$1 json=$2 match
  # "|| true": a missing field must not abort the script under set -e/pipefail.
  match=$(grep -oE '"'"$key"'"[[:space:]]*:[[:space:]]*"(\\.|[^"\\])*"' <<< "$json" | head -n 1) || true
  printf '%s' "$match" | sed 's/^"[^"]*"[[:space:]]*:[[:space:]]*"//; s/"$//'
}

# Print the value of the first non-negative integer field named $1 found in
# the JSON text $2; prints nothing when the field is absent.
extract_json_number() {
  local key=$1 json=$2
  grep -oE '"'"$key"'"[[:space:]]*:[[:space:]]*[0-9]+' <<< "$json" \
    | head -n 1 \
    | grep -oE '[0-9]+$' || true
}

# Write a JPEG copy of $1, at most 800px on its longest side, to $2.
# Returns non-zero when no tool is available or the conversion fails.
downscale_image() {
  local src=$1 dst=$2
  if command -v sips &> /dev/null; then
    # macOS. Force JPEG output so the data matches the .jpg name the MIME
    # type is later derived from.
    sips -s format jpeg -Z 800 "$src" --out "$dst" &> /dev/null || return 1
  elif command -v convert &> /dev/null; then
    # ImageMagick picks the output format from the .jpg extension.
    convert "$src" -resize '800x800>' "$dst" 2> /dev/null || return 1
  else
    return 1
  fi
  [ -s "$dst" ]
}

# Validate the local file $1, downscale it when it exceeds MAX_UPLOAD_BYTES,
# and store the resulting data: URI in the global image_data_url.
# Exits through fail() on any error.
encode_local_image() {
  local image_path=$1
  local file_size image_to_encode compressed extension extension_lower
  local mime_type base64_image

  if [ ! -f "$image_path" ]; then
    fail "{\"error\": \"Image file not found\", \"path\": \"$(json_escape "$image_path")\"}"
  fi

  file_size=$(wc -c < "$image_path" | tr -d ' ')
  image_to_encode=$image_path

  if [ "$file_size" -gt "$MAX_UPLOAD_BYTES" ]; then
    # A private directory lets the file carry a real .jpg extension without
    # relying on mktemp templates with suffixes (not portable to BSD/macOS).
    temp_dir=$(mktemp -d "${TMPDIR:-/tmp}/vision_compressed_XXXXXX") || temp_dir=""
    if [ -n "$temp_dir" ]; then
      compressed="$temp_dir/compressed.jpg"
      if downscale_image "$image_path" "$compressed"; then
        image_to_encode=$compressed
        echo "[vision.sh] Compressed large image ($((file_size / 1024))KB) to avoid parameter limit" >&2
      fi
    fi
  fi

  # Detect image format from file extension
  extension="${image_to_encode##*.}"
  extension_lower=$(printf '%s' "$extension" | tr '[:upper:]' '[:lower:]')

  case "$extension_lower" in
    jpg | jpeg) mime_type="image/jpeg" ;;
    png) mime_type="image/png" ;;
    gif) mime_type="image/gif" ;;
    webp) mime_type="image/webp" ;;
    *)
      fail "{\"error\": \"Unsupported image format\", \"extension\": \"$(json_escape "$extension")\", \"supported\": [\"jpg\", \"jpeg\", \"png\", \"gif\", \"webp\"]}"
      ;;
  esac

  if ! command -v base64 &> /dev/null; then
    fail '{"error": "base64 command not found", "help": "Please install base64 utility"}'
  fi

  # Read from stdin (works for both GNU and BSD base64) and strip the line
  # wrapping GNU base64 adds; raw newlines are illegal inside a JSON string.
  base64_image=$(base64 < "$image_to_encode" 2> /dev/null | tr -d '\r\n') || base64_image=""

  if [ -z "$base64_image" ]; then
    fail "{\"error\": \"Failed to encode image to base64\", \"path\": \"$(json_escape "$image_path")\"}"
  fi

  image_data_url="data:$mime_type;base64,$base64_image"
}

# Print the chat-completions request body.
# $1 model, $2 question, $3 image URL -- all already safe for JSON embedding.
build_request_body() {
  cat <<EOF
{
  "model": "$1",
  "messages": [
    {
      "role": "user",
      "content": [
        {
          "type": "text",
          "text": "$2"
        },
        {
          "type": "image_url",
          "image_url": {
            "url": "$3"
          }
        }
      ]
    }
  ],
  "max_tokens": 1000
}
EOF
}

main() {
  local image_input=${1:-}
  local question=${2:-}
  local model=${3:-gpt-4.1-mini}

  if [ -z "$image_input" ]; then
    fail '{"error": "Image path or URL is required", "usage": "bash vision.sh <image_path_or_url> <question> [model]"}'
  fi

  if [ -z "$question" ]; then
    fail '{"error": "Question is required", "usage": "bash vision.sh <image_path_or_url> <question> [model]"}'
  fi

  if [ -z "${OPENAI_API_KEY:-}" ]; then
    fail '{"error": "OPENAI_API_KEY environment variable is not set", "help": "Visit https://platform.openai.com/api-keys to get an API key"}'
  fi

  # Set API base URL (default to OpenAI's official endpoint), without a
  # trailing slash.
  local api_base=${OPENAI_API_BASE:-https://api.openai.com/v1}
  api_base=${api_base%/}

  # Remote images are passed by URL; local files are inlined as a data: URI.
  local image_url_json
  if [[ "$image_input" =~ ^https?:// ]]; then
    image_url_json=$(json_escape "$image_input")
  else
    encode_local_image "$image_input"
    # base64 and MIME type contain no characters that need JSON escaping.
    image_url_json=$image_data_url
  fi

  local model_json question_json
  model_json=$(json_escape "$model")
  question_json=$(json_escape "$question")

  # Call OpenAI API. The body is streamed on stdin because a base64 image
  # easily exceeds the kernel's per-argument size limit when passed via -d.
  # "|| curl_status=$?" keeps set -e from aborting before we can report.
  local response curl_status=0
  response=$(build_request_body "$model_json" "$question_json" "$image_url_json" \
    | curl -sS --max-time "$API_TIMEOUT_SECONDS" \
      -X POST \
      -H "Authorization: Bearer $OPENAI_API_KEY" \
      -H "Content-Type: application/json" \
      --data-binary @- \
      "$api_base/chat/completions" 2>&1) || curl_status=$?

  if [ "$curl_status" -ne 0 ]; then
    fail "{\"error\": \"Failed to call OpenAI API\", \"details\": \"$(json_escape "$response")\"}"
  fi

  # Simple JSON validation - check if response starts with { or [
  local json_start_re='^[[:space:]]*(\{|\[)'
  if [[ ! "$response" =~ $json_start_re ]]; then
    fail "{\"error\": \"Invalid JSON response from API\", \"response\": \"$(json_escape "$response")\"}"
  fi

  # Check for API error (look for an "error" object in the response)
  if grep -q '"error"[[:space:]]*:[[:space:]]*{' <<< "$response"; then
    local error_msg
    error_msg=$(extract_json_string "message" "$response")
    if [ -z "$error_msg" ]; then
      error_msg="Unknown API error"
    fi
    # error_msg is still JSON-escaped and response is JSON, so both embed as-is.
    fail "{\"error\": \"OpenAI API error\", \"message\": \"$error_msg\", \"response\": $response}"
  fi

  # Extract choices[0].message.content (first "content" string in the
  # response) and the usage counters.
  local content prompt_tokens completion_tokens total_tokens
  content=$(extract_json_string "content" "$response")
  prompt_tokens=$(extract_json_number "prompt_tokens" "$response")
  completion_tokens=$(extract_json_number "completion_tokens" "$response")
  total_tokens=$(extract_json_number "total_tokens" "$response")

  # Build simplified response
  if [ -n "$content" ]; then
    # content is deliberately left JSON-escaped: unescaping it before
    # re-embedding would emit raw newlines/quotes and break the output JSON.
    cat <<EOF
{
  "model": "$model_json",
  "content": "$content",
  "usage": {
    "prompt_tokens": ${prompt_tokens:-0},
    "completion_tokens": ${completion_tokens:-0},
    "total_tokens": ${total_tokens:-0}
  }
}
EOF
  else
    # If we can't extract content, return the full response
    printf '%s\n' "$response"
  fi
}

main "$@"
|
||||
Reference in New Issue
Block a user