chatgpt-on-wechat/agent/memory/storage.py

"""
Storage layer for memory using SQLite + FTS5

Provides vector and keyword search capabilities
"""

from __future__ import annotations
import re
import sqlite3
import json
import hashlib
import threading
from typing import List, Dict, Optional, Any
from pathlib import Path
from dataclasses import dataclass
try:
    import numpy as np
    _HAS_NUMPY = True
except ImportError:
    _HAS_NUMPY = False
    np = None  # type: ignore[assignment]

# UPSERT (INSERT … ON CONFLICT DO UPDATE) requires SQLite ≥ 3.24.0 (2018).
# Older systems (e.g. CentOS 7 ships SQLite 3.7) fall back to INSERT OR REPLACE,
# which risks FTS5 rowid drift on chunk updates (see save_chunk docstring).
# NOTE: sqlite3.sqlite_version_info describes the SQLite library loaded at
# runtime, so this flag is decided once, at import time, per interpreter.
_HAS_UPSERT = sqlite3.sqlite_version_info >= (3, 24, 0)

# ---------------------------------------------------------------------------
# CJK character ranges, compiled once at module load.
# Covers: CJK Symbols/Punctuation, Japanese kana (hiragana + katakana),
#         CJK Unified Ideographs + Extension A, Korean syllables (Hangul),
#         CJK Compatibility Ideographs, and the supplementary-plane span
#         U+20000–U+2FA1F (CJK Extension B onwards).
# The fragments below are adjacent string literals, so Python joins them into
# one string; that string is only meaningful inside a regex character class
# ([...]), which is how every pattern below uses it.
# ---------------------------------------------------------------------------
_CJK_RANGES = (
    r'\u3000-\u30ff'          # CJK Symbols/Punctuation + Japanese kana
    r'\u3400-\u9fff'          # CJK Unified Ideographs (incl. Extension A)
    r'\uac00-\ud7af'          # Korean syllables (Hangul)
    r'\uf900-\ufaff'          # CJK Compatibility Ideographs
    r'\U00020000-\U0002fa1f'  # Supplementary plane: CJK Extension B through U+2FA1F
)
# Matches one character from the ranges above (used to test "has any CJK").
_RE_CONTAINS_CJK   = re.compile(f'[{_CJK_RANGES}]')
# Matches each maximal run of consecutive characters from the ranges above.
_RE_CJK_WORDS      = re.compile(f'[{_CJK_RANGES}]+')
# Matches each maximal run of CJK characters, or of ASCII letters/digits/underscore.
_RE_TRIGRAM_TOKENS = re.compile(f'[{_CJK_RANGES}]+|[A-Za-z0-9_]+')


@dataclass
class MemoryChunk:
    """One row of the ``chunks`` table: a span of text plus its optional embedding.

    The field names match the column names written by
    ``MemoryStorage.save_chunk`` / ``save_chunks_batch``.
    """
    # Primary key of the chunks table; saving a chunk whose id already exists
    # overwrites that row.
    id: str
    # User the chunk belongs to; may be None. Searches given a user_id return
    # only rows that are 'shared' or carry that same user_id.
    user_id: Optional[str]
    scope: str  # "shared" | "user" | "session"
    source: str  # "memory" | "session"
    # Identifier of the document the text came from (stored as-is).
    path: str
    # Line span of the chunk within ``path``.
    # NOTE(review): whether these are 0- or 1-based / inclusive is not visible here.
    start_line: int
    end_line: int
    # The chunk content; this is what the FTS5 indexes and LIKE search operate on.
    text: str
    # Embedding vector used by search_vector; None means the chunk is skipped
    # by vector search (rows are filtered with "embedding IS NOT NULL").
    embedding: Optional[List[float]]
    # Content hash stored alongside the chunk (indexed together with ``path``).
    hash: str
    # Free-form extra data; serialised with json.dumps when saved. An empty
    # dict is stored as NULL, same as None.
    metadata: Optional[Dict[str, Any]] = None


@dataclass
class SearchResult:
    """A single hit returned by the vector / keyword search methods."""
    # path, start_line and end_line are copied from the matching chunk row.
    path: str
    start_line: int
    end_line: int
    # Relevance score; higher is better. The scale depends on the producer:
    # cosine similarity for vector search, a 0.45–0.85 match-count heuristic
    # for LIKE search, and a value derived from bm25 for FTS5 search.
    score: float
    # Chunk text shortened by the storage layer's truncation helper (500 limit).
    snippet: str
    # Copied from the chunk row ("memory" | "session").
    source: str
    # Owner of the matching chunk, if any.
    user_id: Optional[str] = None


class MemoryStorage:
    """SQLite-based storage with FTS5 for keyword search"""

    def __init__(self, db_path: Path):
        """Create the storage object and open/initialise the database at ``db_path``.

        All attributes that ``_init_db`` relies on are set up first; the
        connection and the FTS5 availability flag hold placeholder values
        until ``_init_db`` replaces them.
        """
        # Re-entrant lock guarding writes issued from several threads of this
        # process. WAL mode takes care of read/write concurrency at the file
        # level, but writers sharing this one connection still need a
        # Python-level lock.
        self._lock = threading.RLock()
        # Becomes True in _init_db when the SQLite build provides FTS5.
        self.fts5_available = False
        self.conn: Optional[sqlite3.Connection] = None
        self.db_path = db_path
        self._init_db()

    def _check_fts5_support(self) -> bool:
        """Check if SQLite has FTS5 support"""
        try:
            self.conn.execute("CREATE VIRTUAL TABLE IF NOT EXISTS fts5_test USING fts5(test)")
            self.conn.execute("DROP TABLE IF EXISTS fts5_test")
            return True
        except sqlite3.OperationalError as e:
            if "no such module: fts5" in str(e):
                return False
            raise

    def _init_db(self):
        """Open the database, self-heal known failure modes, and create the schema.

        Steps, in order:

        1. Connect and run ``PRAGMA integrity_check``. If the check reports a
           problem or raises ``sqlite3.DatabaseError``, the database file and
           its ``-wal`` / ``-shm`` companions are deleted and a fresh, empty
           database is opened in their place.
        2. Probe for FTS5 support and log when UPSERT or FTS5 is unavailable.
        3. Enable WAL journaling and a 5000 ms busy timeout.
        4. Create the ``chunks`` table and its indexes, the ``chunks_fts``
           index (repairing inconsistent or corrupt FTS5 state first), the
           ``_meta`` key-value table, the optional trigram FTS5 index, and the
           ``files`` table, then commit.

        Sets ``self.conn``, ``self.fts5_available`` and
        ``self.trigram_fts5_available``.

        Raises:
            Exception: any error from steps 1-3 is logged and re-raised;
                errors while creating the schema propagate unchanged.
        """
        try:
            self.conn = self._connect_db()

            # The integrity check has to be the first statement executed. On a
            # damaged file other statements (including the FTS5 probe) can
            # raise sqlite3.DatabaseError as well, which would abort start-up
            # before the recovery below ever gets a chance to run.
            problem = self._integrity_problem()
            if problem is not None:
                from common.log import logger
                logger.warning(
                    "[MemoryStorage] Database integrity check failed (%s); recreating %s",
                    problem, self.db_path,
                )
                self._discard_db_files()
                self.conn = self._connect_db()

            # Check FTS5 support (on the recreated database, if it was replaced)
            self.fts5_available = self._check_fts5_support()
            if not _HAS_UPSERT:
                from common.log import logger
                logger.warning(
                    "[MemoryStorage] SQLite %s < 3.24 — UPSERT unavailable. "
                    "Falling back to INSERT OR REPLACE; FTS5 rowid may drift on "
                    "chunk updates (rebuild index periodically to recover).",
                    sqlite3.sqlite_version,
                )
            if not self.fts5_available:
                from common.log import logger
                logger.debug("[MemoryStorage] FTS5 not available, using LIKE-based keyword search")

            # Enable WAL mode for better concurrency
            self.conn.execute("PRAGMA journal_mode=WAL")
            # Set busy timeout to avoid "database is locked" errors
            self.conn.execute("PRAGMA busy_timeout=5000")
        except Exception as e:
            from common.log import logger
            logger.error(
                "[MemoryStorage] Unexpected error during database initialization: %s", e
            )
            raise

        # Create chunks table with embeddings
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS chunks (
                id TEXT PRIMARY KEY,
                user_id TEXT,
                scope TEXT NOT NULL DEFAULT 'shared',
                source TEXT NOT NULL DEFAULT 'memory',
                path TEXT NOT NULL,
                start_line INTEGER NOT NULL,
                end_line INTEGER NOT NULL,
                text TEXT NOT NULL,
                embedding TEXT,
                hash TEXT NOT NULL,
                metadata TEXT,
                created_at INTEGER DEFAULT (strftime('%s', 'now')),
                updated_at INTEGER DEFAULT (strftime('%s', 'now'))
            )
        """)

        # Create indexes
        self.conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_chunks_user
            ON chunks(user_id)
        """)

        self.conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_chunks_scope
            ON chunks(scope)
        """)

        self.conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_chunks_hash
            ON chunks(path, hash)
        """)

        # Create FTS5 virtual table + triggers (only if supported).
        # Self-heal: if the previous process crashed mid-rebuild and left
        # triggers pointing at a missing chunks_fts (or vice versa), wipe
        # both sides and recreate cleanly. Otherwise next chunks INSERT
        # will fail with "no such table: chunks_fts".
        if self.fts5_available:
            if self._fts5_state_inconsistent():
                from common.log import logger
                logger.warning(
                    "[MemoryStorage] FTS5 state inconsistent (triggers/table mismatch). "
                    "Resetting chunks_fts to recover."
                )
                self.conn.execute("DROP TRIGGER IF EXISTS chunks_ai")
                self.conn.execute("DROP TRIGGER IF EXISTS chunks_ad")
                self.conn.execute("DROP TRIGGER IF EXISTS chunks_au")
                self.conn.execute("DROP TABLE IF EXISTS chunks_fts")
                self.conn.commit()
            self._create_fts5_objects()

            # Probe FTS5 shadow tables. The schema may be intact but the
            # internal _data/_idx/_docsize blob can still be corrupt — that
            # surfaces as "database disk image is malformed" on bm25 / MATCH.
            # We rebuild from the chunks table when that happens; data isn't
            # lost because chunks (the content table) is the source of truth.
            if self._fts5_shadow_corrupt():
                from common.log import logger
                logger.warning(
                    "[MemoryStorage] FTS5 shadow tables corrupt; rebuilding from chunks."
                )
                self._rebuild_fts5_from_chunks()

        # Internal key-value store for persistent flags (e.g. backfill tracking)
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS _meta (
                key TEXT PRIMARY KEY,
                value TEXT NOT NULL
            )
        """)

        # Create trigram FTS5 table for CJK / mixed-language search.
        # Any failure here (e.g. a SQLite build without the trigram tokenizer)
        # is logged and leaves trigram search disabled instead of aborting.
        self.trigram_fts5_available = False
        if self.fts5_available:
            try:
                self.conn.execute("""
                    CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts_trigram USING fts5(
                        text,
                        id UNINDEXED,
                        user_id UNINDEXED,
                        path UNINDEXED,
                        source UNINDEXED,
                        scope UNINDEXED,
                        content='chunks',
                        content_rowid='rowid',
                        tokenize='trigram case_sensitive 0'
                    )
                """)
                self.conn.execute("""
                    CREATE TRIGGER IF NOT EXISTS chunks_trigram_ai
                    AFTER INSERT ON chunks BEGIN
                        INSERT INTO chunks_fts_trigram(rowid, text, id, user_id, path, source, scope)
                        VALUES (new.rowid, new.text, new.id, new.user_id, new.path, new.source, new.scope);
                    END
                """)
                self.conn.execute("""
                    CREATE TRIGGER IF NOT EXISTS chunks_trigram_ad
                    AFTER DELETE ON chunks BEGIN
                        DELETE FROM chunks_fts_trigram WHERE rowid = old.rowid;
                    END
                """)
                self.conn.execute("""
                    CREATE TRIGGER IF NOT EXISTS chunks_trigram_au
                    AFTER UPDATE ON chunks BEGIN
                        UPDATE chunks_fts_trigram
                        SET text=new.text, id=new.id, user_id=new.user_id,
                            path=new.path, source=new.source, scope=new.scope
                        WHERE rowid = new.rowid;
                    END
                """)
                # One-time backfill for rows that existed before the trigram
                # table did. Whether it already ran is remembered as a flag in
                # _meta rather than inferred from counting trigram rows.
                backfill_done = self.conn.execute(
                    "SELECT 1 FROM _meta WHERE key = 'trigram_backfill_done'"
                ).fetchone()
                chunks_count = self.conn.execute(
                    "SELECT COUNT(*) as c FROM chunks"
                ).fetchone()['c']
                if chunks_count > 0 and not backfill_done:
                    self.conn.execute(
                        "INSERT INTO chunks_fts_trigram(chunks_fts_trigram) VALUES('rebuild')"
                    )
                    self.conn.execute(
                        "INSERT OR REPLACE INTO _meta(key, value) VALUES('trigram_backfill_done', '1')"
                    )
                self.trigram_fts5_available = True
            except Exception:
                from common.log import logger
                logger.warning("[MemoryStorage] trigram FTS5 unavailable, CJK search will use LIKE fallback", exc_info=True)
                self.trigram_fts5_available = False

        # Create files metadata table
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS files (
                path TEXT PRIMARY KEY,
                source TEXT NOT NULL DEFAULT 'memory',
                hash TEXT NOT NULL,
                mtime INTEGER NOT NULL,
                size INTEGER NOT NULL,
                updated_at INTEGER DEFAULT (strftime('%s', 'now'))
            )
        """)

        self.conn.commit()

    def _connect_db(self) -> sqlite3.Connection:
        """Open ``self.db_path`` and return a connection yielding ``sqlite3.Row`` rows."""
        conn = sqlite3.connect(str(self.db_path), check_same_thread=False)
        conn.row_factory = sqlite3.Row
        return conn

    def _integrity_problem(self) -> Optional[str]:
        """Return a description of database corruption, or None when the check passes."""
        try:
            result = self.conn.execute("PRAGMA integrity_check").fetchone()
        except sqlite3.DatabaseError as exc:
            return str(exc)
        if result[0] == 'ok':
            return None
        return str(result[0])

    def _discard_db_files(self):
        """Close the connection and delete the database file plus its WAL side files."""
        if self.conn:
            self.conn.close()
            self.conn = None
        self.db_path.unlink(missing_ok=True)
        for suffix in ('-wal', '-shm'):
            Path(str(self.db_path) + suffix).unlink(missing_ok=True)

    def _fts5_state_inconsistent(self) -> bool:
        """Detect a half-broken FTS5 setup (e.g. trigger exists but table doesn't)."""
        try:
            row = self.conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table' AND name='chunks_fts'"
            ).fetchone()
            table_exists = row is not None
            row = self.conn.execute(
                "SELECT COUNT(*) FROM sqlite_master WHERE type='trigger' "
                "AND name IN ('chunks_ai','chunks_ad','chunks_au')"
            ).fetchone()
            trigger_count = int(row[0]) if row else 0
        except Exception:
            return False
        # Healthy = both present (3 triggers + table) or both absent.
        return table_exists != (trigger_count > 0)

    def _create_fts5_objects(self):
        """Create chunks_fts virtual table and the 3 sync triggers.

        Idempotent: uses IF NOT EXISTS. Caller must hold self.conn.
        """
        self.conn.execute("""
            CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
                text,
                id UNINDEXED,
                user_id UNINDEXED,
                path UNINDEXED,
                source UNINDEXED,
                scope UNINDEXED,
                content='chunks',
                content_rowid='rowid'
            )
        """)
        self.conn.execute("""
            CREATE TRIGGER IF NOT EXISTS chunks_ai AFTER INSERT ON chunks BEGIN
                INSERT INTO chunks_fts(rowid, text, id, user_id, path, source, scope)
                VALUES (new.rowid, new.text, new.id, new.user_id, new.path, new.source, new.scope);
            END
        """)
        self.conn.execute("""
            CREATE TRIGGER IF NOT EXISTS chunks_ad AFTER DELETE ON chunks BEGIN
                DELETE FROM chunks_fts WHERE rowid = old.rowid;
            END
        """)
        self.conn.execute("""
            CREATE TRIGGER IF NOT EXISTS chunks_au AFTER UPDATE ON chunks BEGIN
                UPDATE chunks_fts SET text = new.text, id = new.id,
                                     user_id = new.user_id, path = new.path,
                                     source = new.source, scope = new.scope
                WHERE rowid = new.rowid;
            END
        """)

    def reset_fts5(self):
        """Drop and recreate chunks_fts + triggers in one transaction.

        Used by rebuild_index to recover from FTS5 shadow-table corruption
        (bm25/ORDER BY rank may raise "database disk image is malformed"
        even when raw MATCH still works).

        Triggers must be dropped first; otherwise the next chunks INSERT/DELETE
        on the existing connection will hit "no such table: chunks_fts".
        """
        if not self.fts5_available:
            return
        self.conn.execute("DROP TRIGGER IF EXISTS chunks_ai")
        self.conn.execute("DROP TRIGGER IF EXISTS chunks_ad")
        self.conn.execute("DROP TRIGGER IF EXISTS chunks_au")
        self.conn.execute("DROP TABLE IF EXISTS chunks_fts")
        self._create_fts5_objects()
        self.conn.commit()

    def _fts5_shadow_corrupt(self) -> bool:
        """Probe whether bm25 over chunks_fts errors out at startup.

        Schema (table + triggers) can be intact while the underlying
        FTS5 shadow blobs are malformed — typically because the previous
        process crashed mid-write or wrote with a different SQLite build.
        A cheap MATCH probe surfaces it immediately."""
        try:
            self.conn.execute(
                "SELECT bm25(chunks_fts) FROM chunks_fts WHERE chunks_fts MATCH 'a' LIMIT 1"
            ).fetchone()
            return False
        except sqlite3.DatabaseError as e:
            msg = str(e).lower()
            return "malformed" in msg or "corrupt" in msg
        except Exception:
            # Any other error (e.g. table missing) is handled by the
            # state-inconsistent path; treat as healthy here.
            return False

    def _rebuild_fts5_from_chunks(self):
        """Drop FTS5, recreate it, then INSERT every row from chunks.

        Safe data-wise: chunks (the content table) is the source of truth.
        Done in one transaction so a crash leaves either fully old or fully
        new state, not a partial rebuild.
        """
        # Reset schema first; this clears any malformed shadow blobs.
        self.reset_fts5()
        # Re-feed content. Triggers handle future writes automatically.
        self.conn.execute("""
            INSERT INTO chunks_fts(rowid, text, id, user_id, path, source, scope)
            SELECT rowid, text, id, user_id, path, source, scope FROM chunks
        """)
        self.conn.commit()

    def save_chunk(self, chunk: MemoryChunk):
        """Save a memory chunk (insert or update by id).

        Delegates to ``save_chunks_batch`` with a one-element list so the
        insert/update SQL lives in exactly one place and the two entry points
        cannot drift apart. Locking and commit behaviour are those of the
        batch method.

        Why UPSERT (INSERT … ON CONFLICT DO UPDATE) is used instead of
        INSERT OR REPLACE when the SQLite version allows it:
        INSERT OR REPLACE internally does DELETE+INSERT, which changes the
        row's rowid.  Because both FTS5 tables use content_rowid='rowid', a
        new rowid would leave the old FTS index entries pointing at a
        non-existent rowid and trigger
        "fts5: missing row N from content table" errors.
        ON CONFLICT DO UPDATE fires the AFTER UPDATE trigger (chunks_au /
        chunks_trigram_au) and keeps the original rowid intact.
        """
        self.save_chunks_batch([chunk])

    def save_chunks_batch(self, chunks: List[MemoryChunk]):
        """Save multiple chunks in a batch (insert or update by id).

        See save_chunk for why UPSERT is used instead of INSERT OR REPLACE.
        """
        if _HAS_UPSERT:
            _SQL = """
                INSERT INTO chunks
                (id, user_id, scope, source, path, start_line, end_line,
                 text, embedding, hash, metadata, updated_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, strftime('%s', 'now'))
                ON CONFLICT(id) DO UPDATE SET
                    user_id     = excluded.user_id,
                    scope       = excluded.scope,
                    source      = excluded.source,
                    path        = excluded.path,
                    start_line  = excluded.start_line,
                    end_line    = excluded.end_line,
                    text        = excluded.text,
                    embedding   = excluded.embedding,
                    hash        = excluded.hash,
                    metadata    = excluded.metadata,
                    updated_at  = strftime('%s', 'now')
            """
        else:
            _SQL = """
                INSERT OR REPLACE INTO chunks
                (id, user_id, scope, source, path, start_line, end_line,
                 text, embedding, hash, metadata, updated_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, strftime('%s', 'now'))
            """
        params_list = [
            (
                c.id, c.user_id, c.scope, c.source, c.path,
                c.start_line, c.end_line, c.text,
                self._encode_embedding(c.embedding),
                c.hash,
                json.dumps(c.metadata) if c.metadata else None,
            )
            for c in chunks
        ]
        with self._lock:
            self.conn.executemany(_SQL, params_list)
            self.conn.commit()

    def get_chunk(self, chunk_id: str) -> Optional[MemoryChunk]:
        """Get a chunk by ID"""
        row = self.conn.execute("""
            SELECT * FROM chunks WHERE id = ?
        """, (chunk_id,)).fetchone()

        if not row:
            return None

        return self._row_to_chunk(row)

    def search_vector(
        self,
        query_embedding: List[float],
        user_id: Optional[str] = None,
        scopes: List[str] = None,
        limit: int = 10
    ) -> List[SearchResult]:
        """
        Vector similarity search using numpy-vectorized cosine similarity.
        All embeddings are loaded then scored in a single BLAS matrix-vector
        multiply, which is ~100x faster than the pure-Python per-row loop.
        """
        if scopes is None:
            scopes = ["shared"]
            if user_id:
                scopes.append("user")

        scope_placeholders = ','.join('?' * len(scopes))
        params = list(scopes)

        if user_id:
            query = f"""
                SELECT * FROM chunks
                WHERE scope IN ({scope_placeholders})
                AND (scope = 'shared' OR user_id = ?)
                AND embedding IS NOT NULL
            """
            params.append(user_id)
        else:
            query = f"""
                SELECT * FROM chunks
                WHERE scope IN ({scope_placeholders})
                AND embedding IS NOT NULL
            """

        rows = self.conn.execute(query, params).fetchall()
        if not rows:
            return []

        # Parse embeddings and build a (N, D) matrix in one pass.
        # New rows store BLOB bytes (np.frombuffer); legacy rows fall back to JSON.
        # Filter out rows whose embedding dimension differs from the query —
        # mixing dimensions would cause np.array() to produce an object array
        # and matrix @ q_vec to raise ValueError.
        expected_dim = len(query_embedding)
        valid_rows = []
        vectors = []
        for row in rows:
            vec = self._decode_embedding(row['embedding'])
            if not vec:
                continue
            if len(vec) != expected_dim:
                from common.log import logger
                logger.warning(
                    "[MemoryStorage] Skipping chunk %s: embedding dim %d != query dim %d",
                    row['id'], len(vec), expected_dim
                )
                continue
            valid_rows.append(row)
            vectors.append(vec)

        if not vectors:
            return []

        if _HAS_NUMPY:
            matrix = np.array(vectors, dtype=np.float32)        # (N, D)
            q_vec = np.array(query_embedding, dtype=np.float32)  # (D,)

            # Vectorized cosine similarity: dot(matrix, q) / (||matrix|| * ||q||)
            dots = matrix @ q_vec                                # (N,)
            row_norms = np.linalg.norm(matrix, axis=1)           # (N,)
            q_norm = float(np.linalg.norm(q_vec))
            denominators = row_norms * q_norm
            np.maximum(denominators, 1e-10, out=denominators)    # avoid div-by-zero
            sims = dots / denominators                           # (N,)

            # Select TopK using argpartition (O(N) average), then sort only those K
            k = min(limit, len(valid_rows))
            top_idx = np.argpartition(sims, -k)[-k:]
            top_idx = top_idx[np.argsort(sims[top_idx])[::-1]]

            return [
                SearchResult(
                    path=valid_rows[i]['path'],
                    start_line=valid_rows[i]['start_line'],
                    end_line=valid_rows[i]['end_line'],
                    score=float(sims[i]),
                    snippet=self._truncate_text(valid_rows[i]['text'], 500),
                    source=valid_rows[i]['source'],
                    user_id=valid_rows[i]['user_id']
                )
                for i in top_idx
                if sims[i] > 0
            ]
        else:
            # Pure-Python cosine similarity fallback (numpy not installed)
            import math
            q = query_embedding
            q_norm = math.sqrt(sum(x * x for x in q)) or 1e-10
            scored = []
            for i, vec in enumerate(vectors):
                dot = sum(a * b for a, b in zip(vec, q))
                v_norm = math.sqrt(sum(x * x for x in vec)) or 1e-10
                sim = dot / (v_norm * q_norm)
                if sim > 0:
                    scored.append((sim, valid_rows[i]))
            scored.sort(key=lambda x: x[0], reverse=True)
            return [
                SearchResult(
                    path=row['path'],
                    start_line=row['start_line'],
                    end_line=row['end_line'],
                    score=sim,
                    snippet=self._truncate_text(row['text'], 500),
                    source=row['source'],
                    user_id=row['user_id']
                )
                for sim, row in scored[:limit]
            ]

    def search_keyword(
        self,
        query: str,
        user_id: Optional[str] = None,
        scopes: Optional[List[str]] = None,
        limit: int = 10
    ) -> List[SearchResult]:
        """
        Keyword search using FTS5 with a LIKE safety net.

        Strategy (the first step that yields results wins):
        1. Standard FTS5 (unicode61) for queries without CJK characters.
        2. Trigram FTS5 for CJK/mixed queries, and for non-CJK queries that
           step 1 attempted without finding anything.
        3. LIKE over the chunks table for every query that is still empty
           handed, so that a missing, failing or broken FTS5 index never
           silently kills keyword search.

        Args:
            query: free-text query.
            user_id: when given, rows must be 'shared' or belong to this user.
            scopes: scopes to search; defaults to ["shared"], plus "user"
                when ``user_id`` is given.
            limit: maximum number of results.

        Returns:
            Matching results, or an empty list when no step finds anything.
        """
        if scopes is None:
            scopes = ["shared"]
            if user_id:
                scopes.append("user")

        has_cjk = MemoryStorage._contains_cjk(query)

        # Step 1: Standard FTS5 (unicode61) — pure ASCII queries only.
        # Skipped when query contains any CJK characters: unicode61 tokenises CJK
        # as individual characters without forming meaningful tokens, so it would
        # match only the ASCII portion of a mixed query (e.g. "Python" from
        # "Python教程") and silently discard the CJK part.  Those queries go
        # directly to Step 2 (trigram), which handles both ASCII and CJK together.
        fts1_attempted = False
        if (self.fts5_available
                and not has_cjk
                and MemoryStorage._build_fts_query(query)):
            fts1_attempted = True
            fts_results = self._search_fts5(query, user_id, scopes, limit)
            if fts_results:
                return fts_results

        # Step 2: Trigram FTS5 — CJK/mixed queries, plus fallback when unicode61
        # returned nothing (trigram indexes all scripts with 3-char sliding windows,
        # so it can catch terms that unicode61 tokenisation misses).
        if self.trigram_fts5_available and (has_cjk or fts1_attempted):
            trigram_results = self._search_fts5_trigram(query, user_id, scopes, limit)
            if trigram_results:
                return trigram_results

        # Step 3: LIKE fallback — last resort for any query. The FTS helpers
        # swallow their own errors and return [], so "empty" here may mean
        # "index broken" as well as "no match"; LIKE covers both. It also
        # handles CJK tokens too short for trigram and the no-FTS5 case.
        # _search_like returns [] when the query has no usable tokens.
        return self._search_like(query, user_id, scopes, limit)

    def _search_fts5(
        self,
        query: str,
        user_id: Optional[str],
        scopes: List[str],
        limit: int
    ) -> List[SearchResult]:
        """Run a bm25-ranked MATCH query against the chunks_fts index.

        The raw query is first converted into an FTS5 expression; when that
        yields nothing usable the search is skipped. Rows are restricted to
        the given scopes and, when ``user_id`` is set, to rows that are
        'shared' or owned by that user. Any database error is logged and
        turned into an empty result instead of being raised.
        """
        match_expr = self._build_fts_query(query)
        if not match_expr:
            return []

        scope_placeholders = ','.join('?' * len(scopes))

        # Bind order follows the placeholders: MATCH, scopes, [user], LIMIT.
        if user_id:
            sql_query = f"""
                SELECT chunks.*, bm25(chunks_fts) as rank
                FROM chunks_fts
                JOIN chunks ON chunks.rowid = chunks_fts.rowid
                WHERE chunks_fts MATCH ?
                AND chunks.scope IN ({scope_placeholders})
                AND (chunks.scope = 'shared' OR chunks.user_id = ?)
                ORDER BY rank
                LIMIT ?
            """
            bind_values = [match_expr] + scopes + [user_id, limit]
        else:
            sql_query = f"""
                SELECT chunks.*, bm25(chunks_fts) as rank
                FROM chunks_fts
                JOIN chunks ON chunks.rowid = chunks_fts.rowid
                WHERE chunks_fts MATCH ?
                AND chunks.scope IN ({scope_placeholders})
                ORDER BY rank
                LIMIT ?
            """
            bind_values = [match_expr] + scopes + [limit]

        try:
            hits = []
            for record in self.conn.execute(sql_query, bind_values).fetchall():
                hits.append(SearchResult(
                    path=record['path'],
                    start_line=record['start_line'],
                    end_line=record['end_line'],
                    score=self._bm25_rank_to_score(record['rank']),
                    snippet=self._truncate_text(record['text'], 500),
                    source=record['source'],
                    user_id=record['user_id']
                ))
            return hits
        except Exception:
            from common.log import logger
            logger.warning("[MemoryStorage] _search_fts5 failed, returning empty", exc_info=True)
            return []

    def _search_like(
        self,
        query: str,
        user_id: Optional[str],
        scopes: List[str],
        limit: int
    ) -> List[SearchResult]:
        """LIKE-based substring search over the chunks table.

        Used as the keyword-search fallback. The query is split into CJK runs
        (1+ chars) and ASCII word tokens (3+ chars, to avoid noise); a chunk
        matches when its text contains at least one token, compared
        case-insensitively.

        Tokens are matched literally: LIKE wildcards inside a token (``_`` is
        a legal ASCII token character) are escaped. Rows are ordered by the
        number of distinct token matches *before* LIMIT is applied, so the
        returned rows are the best-matching ones rather than an arbitrary
        subset.

        Args:
            query: free-text query.
            user_id: when given, rows must be 'shared' or belong to this user.
            scopes: scopes to search.
            limit: maximum number of rows fetched.

        Returns:
            Results scored ``min(0.85, 0.3 + 0.15 * matched_tokens)``, best
            first; an empty list when the query has no usable tokens or the
            database query fails (the failure is logged).
        """
        # CJK runs (1+ chars, wide Unicode range) + ASCII words (3+ chars to avoid noise)
        cjk_words = _RE_CJK_WORDS.findall(query)
        ascii_words = [t for t in re.findall(r'[A-Za-z0-9_]+', query) if len(t) >= 3]
        words = cjk_words + ascii_words
        if not words:
            return []

        needles = [word.lower() for word in words]

        # Escape the escape character first, then the two LIKE wildcards, so
        # "foo_bar" only matches a literal underscore.
        patterns = [
            '%' + needle.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_') + '%'
            for needle in needles
        ]
        like_term = "LOWER(text) LIKE ? ESCAPE '\\'"
        where_clause = ' OR '.join([like_term] * len(patterns))
        # Each LIKE evaluates to 0 or 1, so the sum is the per-row match count.
        match_count_expr = ' + '.join([f"({like_term})"] * len(patterns))
        scope_placeholders = ','.join('?' * len(scopes))
        user_filter = "AND (scope = 'shared' OR user_id = ?)" if user_id else ""

        sql_query = f"""
            SELECT * FROM chunks
            WHERE ({where_clause})
            AND scope IN ({scope_placeholders})
            {user_filter}
            ORDER BY ({match_count_expr}) DESC
            LIMIT ?
        """
        # Bind order follows the placeholders in the statement text:
        # WHERE patterns, scopes, [user_id], ORDER BY patterns, LIMIT.
        params = list(patterns)
        params.extend(scopes)
        if user_id:
            params.append(user_id)
        params.extend(patterns)
        params.append(limit)

        try:
            rows = self.conn.execute(sql_query, params).fetchall()
            results = []
            for row in rows:
                # Dynamic score: reward chunks that contain more of the query words.
                # Use all tokens (CJK + ASCII) so pure-ASCII queries are not skipped.
                # matched_count is normally ≥1 because the WHERE clause uses OR, but
                # guard defensively so unexpected zero-match rows are never surfaced.
                text_lower = row['text'].lower()
                matched_count = sum(1 for needle in needles if needle in text_lower)
                if matched_count == 0:
                    continue
                score = min(0.85, 0.3 + 0.15 * matched_count)
                results.append(SearchResult(
                    path=row['path'],
                    start_line=row['start_line'],
                    end_line=row['end_line'],
                    score=score,
                    snippet=self._truncate_text(row['text'], 500),
                    source=row['source'],
                    user_id=row['user_id']
                ))
            results.sort(key=lambda r: r.score, reverse=True)
            return results
        except Exception:
            from common.log import logger
            logger.warning("[MemoryStorage] _search_like failed, returning empty", exc_info=True)
            return []

    def delete_by_path(self, path: str):
        """Delete all chunks from a file"""
        with self._lock:
            self.conn.execute("DELETE FROM chunks WHERE path = ?", (path,))
            self.conn.commit()

    def get_file_hash(self, path: str) -> Optional[str]:
        """Get stored file hash"""
        row = self.conn.execute("""
            SELECT hash FROM files WHERE path = ?
        """, (path,)).fetchone()
        return row['hash'] if row else None

    def update_file_metadata(self, path: str, source: str, file_hash: str, mtime: int, size: int):
        """Update file metadata"""
        with self._lock:
            self.conn.execute("""
                INSERT OR REPLACE INTO files (path, source, hash, mtime, size, updated_at)
                VALUES (?, ?, ?, ?, ?, strftime('%s', 'now'))
            """, (path, source, file_hash, mtime, size))
            self.conn.commit()

    def get_stats(self) -> Dict[str, int]:
        """Get storage statistics"""
        chunks_count = self.conn.execute("""
            SELECT COUNT(*) as cnt FROM chunks
        """).fetchone()['cnt']

        files_count = self.conn.execute("""
            SELECT COUNT(*) as cnt FROM files
        """).fetchone()['cnt']

        embedded_count = self.conn.execute("""
            SELECT COUNT(*) as cnt FROM chunks WHERE embedding IS NOT NULL
        """).fetchone()['cnt']

        return {
            'chunks': chunks_count,
            'files': files_count,
            'embedded': embedded_count,
        }

    def close(self):
        """Close database connection"""
        if self.conn:
            try:
                self.conn.commit()  # Ensure all changes are committed
                self.conn.close()
                self.conn = None  # Mark as closed
            except Exception as e:
                from common.log import logger
                logger.warning("[MemoryStorage] Error closing database connection: %s", e)

    def __del__(self):
        """Destructor to ensure connection is closed"""
        try:
            self.close()
        except Exception:
            pass  # Ignore errors during cleanup

    # Helper methods

    @staticmethod
    def _encode_embedding(embedding: Optional[List[float]]) -> Optional[bytes]:
        """Encode embedding as float32 BLOB bytes (~6x smaller and faster than JSON).
        Falls back to struct.pack when numpy is unavailable."""
        if embedding is None:
            return None
        if _HAS_NUMPY:
            return np.array(embedding, dtype=np.float32).tobytes()
        import struct
        return struct.pack(f'{len(embedding)}f', *embedding)

    @staticmethod
    def _decode_embedding(raw) -> Optional[List[float]]:
        """Decode embedding from BLOB bytes or legacy JSON string.
        Handles both numpy and numpy-free environments."""
        if raw is None:
            return None
        if isinstance(raw, (bytes, bytearray)):
            if _HAS_NUMPY:
                return np.frombuffer(raw, dtype=np.float32).tolist()
            import struct
            n = len(raw) // 4
            return list(struct.unpack(f'{n}f', raw))
        # Legacy JSON format written by older versions
        return json.loads(raw)

    def _row_to_chunk(self, row) -> MemoryChunk:
        """Build a MemoryChunk from a database row addressable by column name.

        Plain columns are copied as-is, the embedding column goes through
        ``_decode_embedding``, and the metadata column is parsed as JSON when
        it holds a non-empty value (None otherwise).
        """
        plain_columns = (
            'id', 'user_id', 'scope', 'source',
            'path', 'start_line', 'end_line', 'text',
        )
        values = {column: row[column] for column in plain_columns}
        values['embedding'] = self._decode_embedding(row['embedding'])
        values['hash'] = row['hash']
        raw_metadata = row['metadata']
        values['metadata'] = json.loads(raw_metadata) if raw_metadata else None
        return MemoryChunk(**values)

    @staticmethod
    def _contains_cjk(text: str) -> bool:
        """Return True when *text* has at least one character in the module's CJK ranges."""
        first_hit = _RE_CONTAINS_CJK.search(text)
        return first_hit is not None

    @staticmethod
    def _build_trigram_query(raw_query: str) -> Optional[str]:
        """
        Turn raw query text into an FTS5 MATCH expression for the trigram index.

        Every CJK run (single characters included) and every ASCII word found
        in the query becomes a double-quoted phrase; the phrases are combined
        with AND so a chunk must contain all of them. Returns None when the
        query has no usable token.
        """
        phrases = []
        for token in _RE_TRIGRAM_TOKENS.findall(raw_query):
            if not token:
                continue
            # FTS5 represents a literal double-quote inside a phrase as "".
            phrases.append('"' + token.replace('"', '""') + '"')
        if not phrases:
            return None
        return ' AND '.join(phrases)

    def _search_fts5_trigram(
        self,
        query: str,
        user_id: Optional[str],
        scopes: List[str],
        limit: int
    ) -> List[SearchResult]:
        """Keyword search against the trigram FTS5 index, ranked by BM25.

        The query is converted with ``_build_trigram_query``; when that yields
        nothing, an empty list is returned without touching the database.
        With a truthy *user_id*, rows are limited to 'shared' chunks plus that
        user's own chunks. Any failure while querying or building results is
        logged as a warning and reported as an empty list.
        """
        match_expr = self._build_trigram_query(query)
        if not match_expr:
            return []

        scope_placeholders = ','.join('?' * len(scopes))
        # Bind order follows the '?' order in the SQL: MATCH, scopes, [user], limit.
        bindings = [match_expr, *scopes]

        if not user_id:
            sql = f"""
                SELECT chunks.*, bm25(chunks_fts_trigram) as rank
                FROM chunks_fts_trigram
                JOIN chunks ON chunks.rowid = chunks_fts_trigram.rowid
                WHERE chunks_fts_trigram MATCH ?
                AND chunks.scope IN ({scope_placeholders})
                ORDER BY rank
                LIMIT ?
            """
            bindings.append(limit)
        else:
            sql = f"""
                SELECT chunks.*, bm25(chunks_fts_trigram) as rank
                FROM chunks_fts_trigram
                JOIN chunks ON chunks.rowid = chunks_fts_trigram.rowid
                WHERE chunks_fts_trigram MATCH ?
                AND chunks.scope IN ({scope_placeholders})
                AND (chunks.scope = 'shared' OR chunks.user_id = ?)
                ORDER BY rank
                LIMIT ?
            """
            bindings.extend([user_id, limit])

        try:
            hits = []
            for row in self.conn.execute(sql, bindings).fetchall():
                hits.append(SearchResult(
                    path=row['path'],
                    start_line=row['start_line'],
                    end_line=row['end_line'],
                    score=self._bm25_rank_to_score(row['rank']),
                    snippet=self._truncate_text(row['text'], 500),
                    source=row['source'],
                    user_id=row['user_id']
                ))
            return hits
        except Exception:
            from common.log import logger
            logger.warning("[MemoryStorage] _search_fts5_trigram failed, returning empty", exc_info=True)
            return []

    @staticmethod
    def _build_fts_query(raw_query: str) -> Optional[str]:
        """
        Build FTS5 query from raw text

        Works best for English and word-based languages.
        For CJK characters, LIKE search will be used as fallback.
        """
        # Extract words (primarily English words and numbers)
        tokens = re.findall(r'[A-Za-z0-9_]+', raw_query)
        if not tokens:
            return None

        # Quote tokens for exact matching
        quoted = [f'"{t}"' for t in tokens]
        # Use OR for more flexible matching
        return ' OR '.join(quoted)

    @staticmethod
    def _bm25_rank_to_score(rank: float) -> float:
        """Convert SQLite BM25 rank to a [0, 1) relevance score.

        SQLite's bm25() returns a non-positive float (0 or negative).
        More negative = more relevant.  max(0, rank) would clip every
        negative value to 0, making every score 1/(1+0) = 1.0 and
        destroying all ranking information.

        abs(rank) / (1 + abs(rank)) maps the absolute relevance magnitude
        to [0, 1): larger |rank| (stronger match) → score closer to 1.
        """
        if rank is None:
            return 0.0
        # Add a floor of 0.3 so any FTS5 match always exceeds typical
        # min_score thresholds (default 0.1).  Small-corpus ranks close to
        # 0 would otherwise produce score≈0 and be filtered out downstream.
        return 0.3 + 0.69 * (abs(rank) / (1.0 + abs(rank)))

    @staticmethod
    def _truncate_text(text: str, max_chars: int) -> str:
        """Truncate text to max characters"""
        if len(text) <= max_chars:
            return text
        return text[:max_chars] + "..."

    @staticmethod
    def compute_hash(content: str) -> str:
        """Compute SHA256 hash of content"""
        return hashlib.sha256(content.encode('utf-8')).hexdigest()