Files
chatgpt-on-wechat/agent/memory/embedding/state.py
yangluxin613 fd571ac539 fix(memory): address PR review — numpy/UPSERT soft deps + BM25 floor + BLOB dim
- numpy soft dependency: try/except import + _HAS_NUMPY flag; _encode_embedding
  and _decode_embedding fall back to struct.pack/unpack; search_vector falls back
  to pure-Python cosine loop — startup never fails without numpy reinstalled
- SQLite UPSERT guard: _HAS_UPSERT = sqlite_version_info >= (3,24,0); save_chunk
  and save_chunks_batch fall back to INSERT OR REPLACE on SQLite < 3.24 with a
  one-time startup warning about potential FTS rowid drift
- _bm25_rank_to_score floor: 0.3 + 0.69*(|rank|/(1+|rank|)) → always in [0.3, 0.99),
  prevents small-corpus matches scoring 0.0 and being filtered by min_score
- detect_index_dim BLOB-aware: check isinstance(raw, bytes) first and return
  len(raw)//4 before json.loads, so /memory status works after embedding format switch
- Comment: "CJK single-char" → "CJK tokens shorter than 3 characters"

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-25 14:15:16 +08:00

52 lines
1.7 KiB
Python

"""
Embedding-related index utilities.
We don't keep a sidecar state file — the SQLite index is the source of truth
and config.json is the source of intent. The two functions below are the
only things needing on-disk awareness:
    detect_index_dim          : read the dim of stored vectors (display-only)
    cleanup_legacy_state_file : remove old embedding_state.json left over from
                                earlier versions; safe no-op when absent.
"""
from __future__ import annotations
import json
import os
from pathlib import Path
from typing import Optional, Union
PathLike = Union[str, os.PathLike]
def detect_index_dim(storage) -> Optional[int]:
    """Return the dimension of the first stored embedding, if any.

    Used by /memory status; this is a display-only, best-effort probe, so
    every failure mode yields ``None`` instead of raising.

    Args:
        storage: Object exposing a ``conn`` attribute whose ``execute()``
            returns a cursor with ``fetchone()``.

    Returns:
        The number of components in the first non-NULL embedding, or
        ``None`` when the index has no embeddings, the query fails, or the
        stored value cannot be interpreted.
    """
    try:
        row = storage.conn.execute(
            "SELECT embedding FROM chunks WHERE embedding IS NOT NULL LIMIT 1"
        ).fetchone()
    except Exception:
        # Missing table, closed connection, etc. -- nothing to report.
        return None
    if not row:
        return None

    try:
        try:
            raw = row["embedding"]
        except (TypeError, IndexError, KeyError):
            # Row is not addressable by column name (e.g. a plain tuple when
            # no row_factory is configured). The query selects exactly one
            # column, so position 0 is the embedding.
            raw = row[0]
        if not raw:
            return None
        if isinstance(raw, (bytes, bytearray)):
            # New BLOB format: 4 bytes per float32
            return len(raw) // 4
        # Otherwise the value is expected to be JSON text encoding a list.
        emb = json.loads(raw)
    except Exception:
        # Covers malformed JSON, unexpected value types and odd row objects.
        return None
    return len(emb) if isinstance(emb, list) else None
def cleanup_legacy_state_file(db_path: PathLike) -> None:
    """Delete a leftover ``embedding_state.json`` written by earlier versions.

    The file is looked up in the directory that contains *db_path*. A missing
    file is not an error, and any exception raised by the deletion is
    swallowed, so the call can be repeated freely.
    """
    stale_state = Path(db_path).parent.joinpath("embedding_state.json")
    try:
        stale_state.unlink(missing_ok=True)
    except Exception:
        # Best-effort cleanup: a failed delete must never propagate.
        return