feat: personal ai agent framework

2026-07-18 20:17:09 +08:00 · 2026-01-30 09:53:46 +08:00
parent 25cf6823d0
commit bb850bb6c5
62 changed files with 7675 additions and 275 deletions
--- a/agent/memory/chunker.py
+++ b/agent/memory/chunker.py
@@ -0,0 +1,139 @@
+"""
+Text chunking utilities for memory
+
+Splits text into line-based chunks bounded by an estimated token limit, with overlap between consecutive chunks
+"""
+
+from typing import List, Tuple
+from dataclasses import dataclass
+
+
+@dataclass
+class TextChunk:
+    """A piece of text together with the range of input lines it covers"""
+    # Chunk content: whole lines joined with '\n', or a slice of a single
+    # over-long line (see TextChunker.chunk_text)
+    text: str
+    # Number of the first input line covered; TextChunker counts lines from 1
+    start_line: int
+    # Number of the last input line covered (inclusive)
+    end_line: int
+
+
+class TextChunker:
+    """
+    Chunks text line by line under an estimated token budget
+
+    Token counts are approximated from character counts
+    (``chars_per_token`` characters per token). Only the characters of the
+    lines themselves are budgeted; the newlines that join lines inside a
+    chunk are not counted, so a chunk's text can be slightly longer than
+    ``max_tokens * chars_per_token`` characters.
+    """
+
+    def __init__(self, max_tokens: int = 500, overlap_tokens: int = 50,
+                 chars_per_token: int = 4):
+        """
+        Initialize chunker
+
+        Args:
+            max_tokens: Maximum estimated tokens per chunk (must be > 0)
+            overlap_tokens: Estimated tokens of trailing lines repeated at
+                the start of the next chunk; zero or negative disables overlap
+            chars_per_token: Characters assumed per token (must be > 0)
+
+        Raises:
+            ValueError: If max_tokens or chars_per_token is not positive
+        """
+        # A non-positive budget would either crash later with an obscure
+        # range() error (zero) or silently drop every line (negative).
+        if max_tokens <= 0:
+            raise ValueError("max_tokens must be a positive number")
+        if chars_per_token <= 0:
+            raise ValueError("chars_per_token must be a positive number")
+
+        self.max_tokens = max_tokens
+        self.overlap_tokens = overlap_tokens
+        # Rough estimation: ~4 chars per token for English/Chinese mixed
+        self.chars_per_token = chars_per_token
+
+    def chunk_text(self, text: str) -> List[TextChunk]:
+        """
+        Chunk text into overlapping segments
+
+        Lines are accumulated until adding the next one would exceed the
+        character budget; the finished chunk is emitted and the next chunk
+        starts with as many trailing lines of the previous one as fit in the
+        overlap budget. A single line longer than the budget is cut into
+        fixed-size pieces that all report that line's number. Chunks that
+        contain only whitespace are not emitted.
+
+        Args:
+            text: Input text to chunk
+
+        Returns:
+            List of TextChunk objects with 1-based, inclusive line ranges
+        """
+        if not text.strip():
+            return []
+
+        lines = text.split('\n')
+        max_chars = self.max_tokens * self.chars_per_token
+        overlap_chars = self.overlap_tokens * self.chars_per_token
+
+        chunks: List[TextChunk] = []
+        current_chunk: List[str] = []
+        current_chars = 0
+        start_line = 1
+
+        for i, line in enumerate(lines, start=1):
+            line_chars = len(line)
+
+            # A single line over the budget is split on its own
+            if line_chars > max_chars:
+                # Emit whatever was accumulated before this line
+                self._append_chunk(chunks, current_chunk, start_line, i - 1)
+                current_chunk = []
+                current_chars = 0
+
+                for sub_chunk in self._split_long_line(line, max_chars):
+                    self._append_chunk(chunks, [sub_chunk], i, i)
+
+                start_line = i + 1
+                continue
+
+            if current_chunk and current_chars + line_chars > max_chars:
+                self._append_chunk(chunks, current_chunk, start_line, i - 1)
+
+                # Cap the overlap so that overlap + this line still fits the
+                # budget; otherwise the new chunk could exceed max_chars (and
+                # keep growing when overlap_tokens >= max_tokens).
+                overlap_budget = min(overlap_chars, max_chars - line_chars)
+                overlap_lines = self._get_overlap_lines(current_chunk, overlap_budget)
+                current_chunk = overlap_lines + [line]
+                current_chars = sum(len(ln) for ln in current_chunk)
+                # The overlap lines are the ones immediately preceding line i
+                start_line = i - len(overlap_lines)
+            else:
+                current_chunk.append(line)
+                current_chars += line_chars
+
+        # Emit the final, partially filled chunk
+        self._append_chunk(chunks, current_chunk, start_line, len(lines))
+
+        return chunks
+
+    def _append_chunk(self, chunks: List[TextChunk], lines: List[str],
+                      start_line: int, end_line: int) -> None:
+        """Append lines as one chunk unless they hold nothing but whitespace"""
+        if any(ln.strip() for ln in lines):
+            chunks.append(TextChunk(
+                text='\n'.join(lines),
+                start_line=start_line,
+                end_line=end_line
+            ))
+
+    def _split_long_line(self, line: str, max_chars: int) -> List[str]:
+        """
+        Split a single long line into consecutive pieces of at most max_chars
+
+        Raises:
+            ValueError: If max_chars is not positive
+        """
+        if max_chars <= 0:
+            raise ValueError("max_chars must be a positive number")
+        return [line[i:i + max_chars] for i in range(0, len(line), max_chars)]
+
+    def _get_overlap_lines(self, lines: List[str], target_chars: int) -> List[str]:
+        """
+        Get last few lines that fit within target_chars for overlap
+
+        Returns the longest suffix of lines whose combined length does not
+        exceed target_chars, in original order. A non-positive target yields
+        no overlap at all (not even empty lines).
+        """
+        if target_chars <= 0:
+            return []
+
+        overlap = []
+        chars = 0
+
+        # Collect from the end, then restore order once (avoids insert(0, ...))
+        for line in reversed(lines):
+            line_chars = len(line)
+            if chars + line_chars > target_chars:
+                break
+            overlap.append(line)
+            chars += line_chars
+
+        overlap.reverse()
+        return overlap
+
+    def chunk_markdown(self, text: str) -> List[TextChunk]:
+        """
+        Chunk markdown text while respecting structure
+        (For future enhancement: respect markdown sections)
+
+        Currently identical to chunk_text.
+        """
+        return self.chunk_text(text)