超长分段处理

2026-02-20 20:10:14 +00:00 · 2025-12-12 23:16:05 +08:00
parent 62cb6d18b0
commit a39f2d9e33
9 changed files with 746 additions and 32 deletions
--- a/backend/services/document_parser.py
+++ b/backend/services/document_parser.py
@@ -3,7 +3,7 @@ Document Parser Service
 Supports: TXT, PDF, DOCX, XLSX
 """
 import io
-from typing import Optional
+from typing import Optional, List
 import PyPDF2
 from docx import Document
 import openpyxl
@@ -40,6 +40,38 @@ class DocumentParser:
        except Exception as e:
            raise Exception(f"Failed to parse PDF: {str(e)}")

+    @staticmethod
+    def split_text_with_overlap(text: str, chunk_size: int = 3000, overlap: int = 500) -> List[str]:
+        """
+        Split text into overlapping chunks for long documents.
+
+        Args:
+            text: Full text content
+            chunk_size: Characters per chunk (default: 3000)
+            overlap: Overlapping characters between chunks (default: 500)
+
+        Returns:
+            List of text chunks ([text] itself when it fits in one chunk)
+
+        Raises:
+            ValueError: If a split is needed and not 0 <= overlap < chunk_size
+        """
+        if len(text) <= chunk_size:
+            return [text]
+        # Each step advances chunk_size - overlap chars; a step <= 0 would loop forever
+        if not 0 <= overlap < chunk_size:
+            raise ValueError(f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}")
+        chunks = []
+        start = 0
+        while start < len(text):
+            end = min(start + chunk_size, len(text))
+            chunks.append(text[start:end])
+            print(f"[Text Split] Chunk {len(chunks)}: chars {start}-{end}")
+            # Step back by `overlap` so adjacent chunks share context; stop after the last chunk
+            start = end - overlap if end < len(text) else len(text)
+        print(f"[Text Split] Total chunks: {len(chunks)}")
+        return chunks
+
    @staticmethod
    async def parse_docx(file_content: bytes) -> str:
        """Parse DOCX file"""
--- a/backend/services/llm_service.py
+++ b/backend/services/llm_service.py
@@ -121,7 +121,8 @@ class LLMService:
 **识别规则**：
 - 文档中可能包含中文或英文题目
 - 题目可能有多种格式，请灵活识别
- 即使格式不标准，也请尽量提取题目内容
+- **重要**：只提取完整的题目，忽略任何不完整的题目（题目被截断、缺少选项、缺少关键信息等）
+- 如果题目看起来不完整（比如开头或结尾被切断），直接跳过该题目
 - 如果文档只是普通文章而没有题目，请返回空数组 []

 **题目类型识别** (严格使用以下4种类型之一)：
@@ -404,9 +405,58 @@ class LLMService:
            print(f"[Error] Document parsing failed: {str(e)}")
            raise Exception(f"Failed to parse document: {str(e)}")

+    def split_pdf_pages(self, pdf_bytes: bytes, pages_per_chunk: int = 4, overlap: int = 1) -> List[bytes]:
+        """
+        Split PDF into overlapping chunks to handle long documents.
+
+        Args:
+            pdf_bytes: PDF file content
+            pages_per_chunk: Number of pages per chunk (default: 4)
+            overlap: Number of overlapping pages between chunks (default: 1)
+
+        Returns:
+            List of PDF chunks as bytes ([pdf_bytes] itself when no split is needed)
+
+        Raises:
+            ValueError: If a split is needed and not 0 <= overlap < pages_per_chunk
+        """
+        import PyPDF2
+        import io
+
+        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
+        total_pages = len(pdf_reader.pages)
+
+        # If PDF is small, don't split
+        if total_pages <= pages_per_chunk:
+            return [pdf_bytes]
+        # Each step advances pages_per_chunk - overlap pages; a step <= 0 would loop forever
+        if not 0 <= overlap < pages_per_chunk:
+            raise ValueError(f"overlap must satisfy 0 <= overlap < pages_per_chunk, got overlap={overlap}, pages_per_chunk={pages_per_chunk}")
+
+        print(f"[PDF Split] Total pages: {total_pages}, splitting into chunks of {pages_per_chunk} pages with {overlap} page overlap")
+
+        chunks = []
+        start = 0
+        while start < total_pages:
+            end = min(start + pages_per_chunk, total_pages)
+            # Create a new PDF with pages [start, end)
+            pdf_writer = PyPDF2.PdfWriter()
+            for page_num in range(start, end):
+                pdf_writer.add_page(pdf_reader.pages[page_num])
+            # Serialize the chunk; getvalue() returns the whole buffer regardless of stream position
+            chunk_bytes = io.BytesIO()
+            pdf_writer.write(chunk_bytes)
+            chunks.append(chunk_bytes.getvalue())
+            print(f"[PDF Split] Chunk {len(chunks)}: pages {start+1}-{end}")
+            # Step back by `overlap` pages so adjacent chunks share context; stop after the last chunk
+            start = end - overlap if end < total_pages else total_pages
+
+        return chunks
+
    async def parse_document_with_pdf(self, pdf_bytes: bytes, filename: str) -> List[Dict[str, Any]]:
        """
        Parse PDF document using Gemini's native PDF understanding.
+        Automatically splits large PDFs into overlapping chunks.
        Only works with Gemini provider.

        Args:
@@ -419,12 +469,50 @@ class LLMService:
        if self.provider != "gemini":
            raise ValueError("PDF parsing is only supported with Gemini provider")

+        # Split PDF into chunks
+        pdf_chunks = self.split_pdf_pages(pdf_bytes, pages_per_chunk=4, overlap=1)
+
+        print(f"[Gemini PDF] Processing {len(pdf_chunks)} chunk(s) for {filename}")
+
+        all_questions = []
+        # Process each chunk with fuzzy deduplication
+        for chunk_idx, chunk_bytes in enumerate(pdf_chunks):
+            print(f"[Gemini PDF] Processing chunk {chunk_idx + 1}/{len(pdf_chunks)}")
+
+            try:
+                questions = await self._parse_pdf_chunk(chunk_bytes, f"{filename}_chunk_{chunk_idx + 1}")
+                print(f"[Gemini PDF] Chunk {chunk_idx + 1} extracted {len(questions)} questions")
+
+                # Fuzzy deduplicate across chunks
+                from dedup_utils import is_duplicate_question
+
+                for q in questions:
+                    if not is_duplicate_question(q, all_questions, threshold=0.85):
+                        all_questions.append(q)
+                    else:
+                        print(f"[PDF Split] Skipped fuzzy duplicate from chunk {chunk_idx + 1}")
+
+            except Exception as e:
+                print(f"[Gemini PDF] Chunk {chunk_idx + 1} failed: {str(e)}")
+                # Continue with other chunks
+                continue
+
+        print(f"[Gemini PDF] Total questions extracted: {len(all_questions)} (after deduplication)")
+
+        return all_questions
+
+    async def _parse_pdf_chunk(self, pdf_bytes: bytes, chunk_name: str) -> List[Dict[str, Any]]:
+        """
+        Parse a single PDF chunk.
+        Internal method used by parse_document_with_pdf.
+        """
        prompt = """你是一个专业的试题解析专家。请仔细分析这个 PDF 文档，提取其中的所有试题。

 **识别规则**：
 - PDF 中可能包含中文或英文题目、图片、表格、公式
 - 题目可能有多种格式，请灵活识别
- 即使格式不标准，也请尽量提取题目内容
+- **重要**：只提取完整的题目，忽略任何不完整的题目（题目被截断、缺少选项、缺少关键信息等）
+- 如果题目看起来不完整（比如开头或结尾被切断），直接跳过该题目
 - 题目内容如果包含代码或换行，请将换行符替换为\\n
 - 图片中的文字也要识别并提取

@@ -492,8 +580,8 @@ class LLMService:
 - **只返回一个 JSON 数组**，不要包含其他任何内容"""

        try:
-            print(f"[Gemini PDF] Processing PDF: {filename}", flush=True)
-            print(f"[Gemini PDF] File size: {len(pdf_bytes)} bytes", flush=True)
+            print(f"[Gemini PDF] Processing chunk: {chunk_name}", flush=True)
+            print(f"[Gemini PDF] Chunk size: {len(pdf_bytes)} bytes", flush=True)

            # Use Gemini's native PDF processing via REST API
            import base64