超长分段处理

2026-02-20 20:10:14 +00:00 · 2025-12-12 23:16:05 +08:00
parent 62cb6d18b0
commit a39f2d9e33
9 changed files with 746 additions and 32 deletions
--- a/backend/services/document_parser.py
+++ b/backend/services/document_parser.py
@@ -3,7 +3,7 @@ Document Parser Service
 Supports: TXT, PDF, DOCX, XLSX
 """
 import io
-from typing import Optional
+from typing import Optional, List
 import PyPDF2
 from docx import Document
 import openpyxl
@@ -40,6 +40,38 @@ class DocumentParser:
        except Exception as e:
            raise Exception(f"Failed to parse PDF: {str(e)}")

+    @staticmethod
+    def split_text_with_overlap(text: str, chunk_size: int = 3000, overlap: int = 500) -> List[str]:
+        """
+        Split text into overlapping chunks for long documents.
+
+        Consecutive chunks share ``overlap`` characters, so content that
+        straddles a chunk boundary appears in full in at least one chunk.
+
+        Args:
+            text: Full text content
+            chunk_size: Maximum characters per chunk; must be positive
+                (default: 3000)
+            overlap: Overlapping characters between consecutive chunks; must
+                satisfy 0 <= overlap < chunk_size (default: 500)
+
+        Returns:
+            List of text chunks. A text no longer than ``chunk_size`` is
+            returned unchanged as a single-element list.
+
+        Raises:
+            ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
+                negative or not smaller than ``chunk_size``.
+        """
+        if chunk_size <= 0:
+            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
+        # An overlap >= chunk_size would keep the window from ever advancing
+        # (endless loop, unbounded memory); a negative overlap would silently
+        # drop the characters between two chunks.
+        if not 0 <= overlap < chunk_size:
+            raise ValueError(
+                f"overlap must be >= 0 and smaller than chunk_size ({chunk_size}), got {overlap}"
+            )
+
+        if len(text) <= chunk_size:
+            return [text]
+
+        chunks = []
+        start = 0
+
+        while start < len(text):
+            end = min(start + chunk_size, len(text))
+            chunks.append(text[start:end])
+
+            print(f"[Text Split] Chunk {len(chunks)}: chars {start}-{end}")
+
+            # The chunk that reaches the end of the text is the last one.
+            if end == len(text):
+                break
+
+            # Step back by `overlap` so the next chunk repeats the tail of this one.
+            start = end - overlap
+
+        print(f"[Text Split] Total chunks: {len(chunks)}")
+        return chunks
+
    @staticmethod
    async def parse_docx(file_content: bytes) -> str:
        """Parse DOCX file"""