#!/usr/bin/env bash # OpenAI Vision API wrapper # API Docs: https://platform.openai.com/docs/guides/vision set -euo pipefail image_input="${1:-}" question="${2:-}" model="${3:-gpt-4.1-mini}" if [ -z "$image_input" ]; then echo '{"error": "Image path or URL is required", "usage": "bash vision.sh [model]"}' exit 1 fi if [ -z "$question" ]; then echo '{"error": "Question is required", "usage": "bash vision.sh [model]"}' exit 1 fi if [ -z "${OPENAI_API_KEY:-}" ]; then echo '{"error": "OPENAI_API_KEY environment variable is not set", "help": "Visit https://platform.openai.com/api-keys to get an API key"}' exit 1 fi # Set API base URL (default to OpenAI's official endpoint) api_base="${OPENAI_API_BASE:-https://api.openai.com/v1}" # Remove trailing slash if present api_base="${api_base%/}" # Determine if input is a URL or local file if [[ "$image_input" =~ ^https?:// ]]; then # It's a URL - use it directly image_url="$image_input" # Build JSON request body with URL request_body=$(cat < /dev/null; then # macOS: resize to max 800px on longest side $(command -v sips) -Z 800 "$image_input" --out "$temp_compressed" &> /dev/null if [ $? -eq 0 ]; then image_to_encode="$temp_compressed" >&2 echo "[vision.sh] Compressed large image ($(($file_size / 1024))KB) to avoid parameter limit" fi elif command -v convert &> /dev/null; then # Linux: use ImageMagick convert "$image_input" -resize 800x800\> "$temp_compressed" 2>/dev/null if [ $? -eq 0 ]; then image_to_encode="$temp_compressed" >&2 echo "[vision.sh] Compressed large image ($(($file_size / 1024))KB) to avoid parameter limit" fi fi fi # Detect image format from file extension extension="${image_to_encode##*.}" extension_lower=$(echo "$extension" | tr '[:upper:]' '[:lower:]') case "$extension_lower" in jpg|jpeg) mime_type="image/jpeg" ;; png) mime_type="image/png" ;; gif) mime_type="image/gif" ;; webp) mime_type="image/webp" ;; *) echo "{\"error\": \"Unsupported image format\", \"extension\": \"$extension\", \"supported\": [\"jpg\", \"jpeg\", \"png\", \"gif\", \"webp\"]}" # Clean up temp file if exists [ -n "$temp_compressed" ] && rm -f "$temp_compressed" exit 1 ;; esac # Encode image to base64 if command -v base64 &> /dev/null; then # macOS and most Linux systems base64_cmd=$(command -v base64) base64_image=$($base64_cmd -i "$image_to_encode" 2>/dev/null || $base64_cmd "$image_to_encode" 2>/dev/null) else echo '{"error": "base64 command not found", "help": "Please install base64 utility"}' # Clean up temp file if exists [ -n "$temp_compressed" ] && rm -f "$temp_compressed" exit 1 fi # Clean up temp compressed file [ -n "$temp_compressed" ] && rm -f "$temp_compressed" if [ -z "$base64_image" ]; then echo "{\"error\": \"Failed to encode image to base64\", \"path\": \"$image_input\"}" exit 1 fi # Escape question for JSON (replace " with \") escaped_question=$(echo "$question" | sed 's/"/\\"/g') # Build JSON request body with base64 image # Note: Using printf to avoid issues with special characters request_body=$(cat <&1) curl_exit_code=$? if [ $curl_exit_code -ne 0 ]; then echo "{\"error\": \"Failed to call OpenAI API\", \"details\": \"$response\"}" exit 1 fi # Simple JSON validation - check if response starts with { or [ if [[ ! "$response" =~ ^[[:space:]]*[\{\[] ]]; then echo "{\"error\": \"Invalid JSON response from API\", \"response\": \"$response\"}" exit 1 fi # Check for API error (look for "error" field in response) if echo "$response" | grep -q '"error"[[:space:]]*:[[:space:]]*{'; then # Extract error message if possible error_msg=$(echo "$response" | grep -o '"message"[[:space:]]*:[[:space:]]*"[^"]*"' | sed 's/"message"[[:space:]]*:[[:space:]]*"\(.*\)"/\1/' | head -1) if [ -z "$error_msg" ]; then error_msg="Unknown API error" fi echo "{\"error\": \"OpenAI API error\", \"message\": \"$error_msg\", \"response\": $response}" exit 1 fi # Extract the content from the response # The response structure is: choices[0].message.content content=$(echo "$response" | grep -o '"content"[[:space:]]*:[[:space:]]*"[^"]*"' | sed 's/"content"[[:space:]]*:[[:space:]]*"\(.*\)"/\1/' | head -1) # Extract usage information prompt_tokens=$(echo "$response" | grep -o '"prompt_tokens"[[:space:]]*:[[:space:]]*[0-9]*' | grep -o '[0-9]*' | head -1) completion_tokens=$(echo "$response" | grep -o '"completion_tokens"[[:space:]]*:[[:space:]]*[0-9]*' | grep -o '[0-9]*' | head -1) total_tokens=$(echo "$response" | grep -o '"total_tokens"[[:space:]]*:[[:space:]]*[0-9]*' | grep -o '[0-9]*' | head -1) # Build simplified response if [ -n "$content" ]; then # Unescape JSON content (basic unescaping) content=$(echo "$content" | sed 's/\\n/\n/g' | sed 's/\\"/"/g') cat <