Split long documents into chunks; frontend progress feedback not yet working

2025-12-12 23:59:28 +08:00
parent a39f2d9e33
commit f5dd3bfc6c
7 changed files with 605 additions and 43 deletions

View File

@@ -82,3 +82,42 @@ async def get_optional_user(
         return await get_current_user(credentials, db)
     except HTTPException:
         return None
+
+
+async def get_current_user_from_token(token: str, db: AsyncSession) -> User:
+    """
+    Get current user from JWT token string (for SSE with query params).
+
+    Args:
+        token: JWT token string
+        db: Database session
+
+    Returns:
+        User object
+
+    Raises:
+        Exception: If token is invalid or user not found
+    """
+    # Decode token
+    payload = decode_access_token(token)
+    if payload is None:
+        raise Exception("Invalid token")
+
+    user_id = payload.get("sub")
+    if user_id is None:
+        raise Exception("Invalid token payload")
+
+    # Convert user_id to int if it's a string
+    try:
+        user_id = int(user_id)
+    except (ValueError, TypeError):
+        raise Exception("Invalid user ID")
+
+    # Get user from database
+    result = await db.execute(select(User).where(User.id == user_id))
+    user = result.scalar_one_or_none()
+
+    if user is None:
+        raise Exception("User not found")
+
+    return user
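This helper exists because the browser EventSource API cannot attach an Authorization header, so an SSE route has to receive the JWT in the URL and validate it by hand. A minimal sketch of that validation step, assuming FastAPI with a get_db session dependency and a route path that are not part of this commit:

# Sketch only -- route path and get_db are assumptions, not shown in this commit.
from fastapi import APIRouter, Depends, HTTPException, Query
from sqlalchemy.ext.asyncio import AsyncSession

router = APIRouter()

@router.get("/exams/{exam_id}/parse-progress")
async def parse_progress(
    exam_id: int,
    token: str = Query(...),             # JWT arrives in the URL, not a header
    db: AsyncSession = Depends(get_db),  # assumed session dependency
):
    try:
        user = await get_current_user_from_token(token, db)
    except Exception:
        # The helper raises bare Exception for every failure mode,
        # so any error maps to 401 here.
        raise HTTPException(status_code=401, detail="Invalid or expired token")
    # ... stream progress events for exam_id on behalf of user ...

One caveat of this pattern: tokens in query strings tend to end up in access logs, so short-lived tokens are advisable.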

View File

@@ -453,7 +453,7 @@ class LLMService:
 
         return chunks
 
-    async def parse_document_with_pdf(self, pdf_bytes: bytes, filename: str) -> List[Dict[str, Any]]:
+    async def parse_document_with_pdf(self, pdf_bytes: bytes, filename: str, exam_id: int = None) -> List[Dict[str, Any]]:
         """
         Parse PDF document using Gemini's native PDF understanding.
         Automatically splits large PDFs into overlapping chunks.
@@ -462,6 +462,7 @@ class LLMService:
         Args:
             pdf_bytes: PDF file content as bytes
             filename: Original filename for logging
+            exam_id: Optional exam ID for progress updates
 
         Returns:
             List of question dictionaries
@@ -471,17 +472,44 @@ class LLMService:
         # Split PDF into chunks
         pdf_chunks = self.split_pdf_pages(pdf_bytes, pages_per_chunk=4, overlap=1)
+        total_chunks = len(pdf_chunks)
 
-        print(f"[Gemini PDF] Processing {len(pdf_chunks)} chunk(s) for {filename}")
+        print(f"[Gemini PDF] Processing {total_chunks} chunk(s) for {filename}")
+
+        # Send progress update if exam_id provided
+        if exam_id:
+            from services.progress_service import progress_service, ProgressUpdate, ProgressStatus
+            await progress_service.update_progress(ProgressUpdate(
+                exam_id=exam_id,
+                status=ProgressStatus.SPLITTING,
+                message=f"PDF split into {total_chunks} part(s)",
+                progress=15.0,
+                total_chunks=total_chunks
+            ))
 
         all_questions = []
 
         # Process each chunk with fuzzy deduplication
         for chunk_idx, chunk_bytes in enumerate(pdf_chunks):
-            print(f"[Gemini PDF] Processing chunk {chunk_idx + 1}/{len(pdf_chunks)}")
+            current_chunk = chunk_idx + 1
+            # Progress climbs linearly through the 15-75% band as chunks complete
+            chunk_progress = 15.0 + (60.0 * current_chunk / total_chunks)
+
+            print(f"[Gemini PDF] Processing chunk {current_chunk}/{total_chunks}")
+
+            # Send progress update
+            if exam_id:
+                await progress_service.update_progress(ProgressUpdate(
+                    exam_id=exam_id,
+                    status=ProgressStatus.PROCESSING_CHUNK,
+                    message=f"Processing part {current_chunk}/{total_chunks}...",
+                    progress=chunk_progress,
+                    total_chunks=total_chunks,
+                    current_chunk=current_chunk,
+                    questions_extracted=len(all_questions)
+                ))
+
             try:
-                questions = await self._parse_pdf_chunk(chunk_bytes, f"{filename}_chunk_{chunk_idx + 1}")
-                print(f"[Gemini PDF] Chunk {chunk_idx + 1} extracted {len(questions)} questions")
+                questions = await self._parse_pdf_chunk(chunk_bytes, f"{filename}_chunk_{current_chunk}")
+                print(f"[Gemini PDF] Chunk {current_chunk} extracted {len(questions)} questions")
 
                 # Fuzzy deduplicate across chunks
                 from dedup_utils import is_duplicate_question
@@ -490,15 +518,27 @@ class LLMService:
                 if not is_duplicate_question(q, all_questions, threshold=0.85):
                     all_questions.append(q)
                 else:
-                    print(f"[PDF Split] Skipped fuzzy duplicate from chunk {chunk_idx + 1}")
+                    print(f"[PDF Split] Skipped fuzzy duplicate from chunk {current_chunk}")
 
             except Exception as e:
-                print(f"[Gemini PDF] Chunk {chunk_idx + 1} failed: {str(e)}")
+                print(f"[Gemini PDF] Chunk {current_chunk} failed: {str(e)}")
                 # Continue with other chunks
                 continue
 
         print(f"[Gemini PDF] Total questions extracted: {len(all_questions)} (after deduplication)")
 
+        # Send final progress for PDF processing
+        if exam_id:
+            await progress_service.update_progress(ProgressUpdate(
+                exam_id=exam_id,
+                status=ProgressStatus.DEDUPLICATING,
+                message=f"PDF processing complete, extracted {len(all_questions)} questions",
+                progress=75.0,
+                total_chunks=total_chunks,
+                current_chunk=total_chunks,
+                questions_extracted=len(all_questions)
+            ))
+
         return all_questions
 
     async def _parse_pdf_chunk(self, pdf_bytes: bytes, chunk_name: str) -> List[Dict[str, Any]]:
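The cross-chunk deduplication leans on is_duplicate_question from dedup_utils, whose implementation is not shown in this commit. Because split_pdf_pages is called with overlap=1, a question straddling a chunk boundary is extracted twice, and the two extractions are rarely byte-identical, which is why a fuzzy match with a 0.85 threshold is used instead of exact comparison. A plausible sketch of such a helper, assuming each question dict carries its text under a "question" key:

# Hypothetical sketch of dedup_utils.is_duplicate_question -- the real
# implementation is not part of this commit.
from difflib import SequenceMatcher
from typing import Any, Dict, List

def is_duplicate_question(
    candidate: Dict[str, Any],
    existing: List[Dict[str, Any]],
    threshold: float = 0.85,
) -> bool:
    """Return True if candidate's text fuzzily matches any already-kept question."""
    text = (candidate.get("question") or "").strip().lower()
    for kept in existing:
        kept_text = (kept.get("question") or "").strip().lower()
        # ratio() is 1.0 for identical strings, ~0.85+ for near-duplicates
        if SequenceMatcher(None, text, kept_text).ratio() >= threshold:
            return True
    return False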

View File

@@ -0,0 +1,149 @@
+"""
+Progress Service - Manages document parsing progress for real-time updates
+"""
+import asyncio
+from typing import Dict, Optional, AsyncGenerator
+from datetime import datetime
+from enum import Enum
+
+
+class ProgressStatus(str, Enum):
+    """Progress status types"""
+    PENDING = "pending"
+    PARSING = "parsing"
+    SPLITTING = "splitting"
+    PROCESSING_CHUNK = "processing_chunk"
+    DEDUPLICATING = "deduplicating"
+    SAVING = "saving"
+    COMPLETED = "completed"
+    FAILED = "failed"
+
+
+class ProgressUpdate:
+    """Progress update data structure"""
+
+    def __init__(
+        self,
+        exam_id: int,
+        status: ProgressStatus,
+        message: str,
+        progress: float = 0.0,
+        total_chunks: int = 0,
+        current_chunk: int = 0,
+        questions_extracted: int = 0,
+        questions_added: int = 0,
+        duplicates_removed: int = 0
+    ):
+        self.exam_id = exam_id
+        self.status = status
+        self.message = message
+        self.progress = progress  # 0-100
+        self.total_chunks = total_chunks
+        self.current_chunk = current_chunk
+        self.questions_extracted = questions_extracted
+        self.questions_added = questions_added
+        self.duplicates_removed = duplicates_removed
+        self.timestamp = datetime.now().isoformat()
+
+    def to_dict(self) -> dict:
+        """Convert to dictionary for JSON serialization"""
+        return {
+            "exam_id": self.exam_id,
+            "status": self.status.value,
+            "message": self.message,
+            "progress": round(self.progress, 1),
+            "total_chunks": self.total_chunks,
+            "current_chunk": self.current_chunk,
+            "questions_extracted": self.questions_extracted,
+            "questions_added": self.questions_added,
+            "duplicates_removed": self.duplicates_removed,
+            "timestamp": self.timestamp
+        }
+
+
+class ProgressService:
+    """Service for managing parsing progress"""
+
+    def __init__(self):
+        # Store progress updates for each exam
+        self._progress: Dict[int, ProgressUpdate] = {}
+        # Store queues for SSE connections
+        self._queues: Dict[int, list] = {}
+
+    async def update_progress(self, update: ProgressUpdate):
+        """
+        Update progress for an exam and notify all listeners
+
+        Args:
+            update: Progress update object
+        """
+        exam_id = update.exam_id
+        self._progress[exam_id] = update
+
+        # Send to all connected SSE clients for this exam
+        if exam_id in self._queues:
+            dead_queues = []
+            for queue in self._queues[exam_id]:
+                try:
+                    await queue.put(update)
+                except Exception as e:
+                    print(f"[Progress] Failed to send update to queue: {e}")
+                    dead_queues.append(queue)
+            # Clean up dead queues
+            for dead_queue in dead_queues:
+                self._queues[exam_id].remove(dead_queue)
+
+    def get_progress(self, exam_id: int) -> Optional[ProgressUpdate]:
+        """Get current progress for an exam"""
+        return self._progress.get(exam_id)
+
+    async def subscribe(self, exam_id: int) -> AsyncGenerator[ProgressUpdate, None]:
+        """
+        Subscribe to progress updates for an exam (SSE stream)
+
+        Args:
+            exam_id: Exam ID to subscribe to
+
+        Yields:
+            Progress updates as they occur
+        """
+        # Create a queue for this connection
+        queue = asyncio.Queue()
+
+        # Register the queue
+        if exam_id not in self._queues:
+            self._queues[exam_id] = []
+        self._queues[exam_id].append(queue)
+
+        try:
+            # Send current progress if exists
+            current_progress = self.get_progress(exam_id)
+            if current_progress:
+                yield current_progress
+
+            # Stream updates
+            while True:
+                update = await queue.get()
+                yield update
+
+                # Stop streaming if completed or failed
+                if update.status in [ProgressStatus.COMPLETED, ProgressStatus.FAILED]:
+                    break
+        finally:
+            # Cleanup
+            if exam_id in self._queues and queue in self._queues[exam_id]:
+                self._queues[exam_id].remove(queue)
+                if not self._queues[exam_id]:
+                    del self._queues[exam_id]
+
+    def clear_progress(self, exam_id: int):
+        """Clear progress data for an exam"""
+        if exam_id in self._progress:
+            del self._progress[exam_id]
+        if exam_id in self._queues:
+            del self._queues[exam_id]
+
+
+# Singleton instance
+progress_service = ProgressService()
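The commit message notes that the frontend feedback is not working yet; the missing piece is presumably the SSE route that bridges subscribe() to the browser. One plausible wiring, assuming FastAPI's StreamingResponse (the route itself is not part of this commit):

# Hypothetical SSE wiring for progress_service.subscribe(); assumed, not
# shown in this commit.
import json
from fastapi.responses import StreamingResponse

async def sse_progress(exam_id: int) -> StreamingResponse:
    async def event_stream():
        async for update in progress_service.subscribe(exam_id):
            # Each Server-Sent Event frame is "data: <payload>\n\n"
            yield f"data: {json.dumps(update.to_dict())}\n\n"

    return StreamingResponse(
        event_stream(),
        media_type="text/event-stream",
        headers={"Cache-Control": "no-cache"},  # discourage proxy buffering
    )

Note that ProgressService keeps everything in process memory: update_progress and subscribe must share one event loop, so this design works with a single worker but would need something like Redis pub/sub to survive multiple Uvicorn workers.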