Split long documents into chunks; frontend progress feedback not working yet

2025-12-12 23:59:28 +08:00
parent a39f2d9e33
commit f5dd3bfc6c
7 changed files with 605 additions and 43 deletions

View File

@@ -21,15 +21,10 @@ FROM python:3.11-slim
 WORKDIR /app
-# 安装系统依赖
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    && rm -rf /var/lib/apt/lists/*
 # 复制后端依赖文件
 COPY backend/requirements.txt ./
-# 安装 Python 依赖
+# 安装 Python 依赖(使用预编译wheel包,无需gcc)
 RUN pip install --no-cache-dir -r requirements.txt
 # 复制后端代码

View File

@@ -2,12 +2,14 @@
 Exam Router - Handles exam creation, file upload, and deduplication
 """
 from fastapi import APIRouter, Depends, HTTPException, status, UploadFile, File, Form, BackgroundTasks
+from fastapi.responses import StreamingResponse
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy import select, func, and_
 from typing import List, Optional
 from datetime import datetime, timedelta
 import os
 import aiofiles
+import json
 from database import get_db
 from models import User, Exam, Question, ExamStatus, SystemConfig
@@ -19,6 +21,7 @@ from services.auth_service import get_current_user
 from services.document_parser import document_parser
 from services.llm_service import LLMService
 from services.config_service import load_llm_config
+from services.progress_service import progress_service
 from utils import is_allowed_file, calculate_content_hash
 from dedup_utils import is_duplicate_question
@@ -264,9 +267,11 @@ async def async_parse_and_save(
 ):
     """
     Background task to parse document and save questions with deduplication.
+    Sends real-time progress updates via SSE.
     """
     from database import AsyncSessionLocal
     from sqlalchemy import select
+    from services.progress_service import ProgressUpdate, ProgressStatus
 
     async with AsyncSessionLocal() as db:
         try:
@@ -276,6 +281,14 @@ async def async_parse_and_save(
             exam.status = ExamStatus.PROCESSING
             await db.commit()
 
+            # Send initial progress
+            await progress_service.update_progress(ProgressUpdate(
+                exam_id=exam_id,
+                status=ProgressStatus.PARSING,
+                message="开始解析文档...",
+                progress=5.0
+            ))
+
             # Load LLM configuration from database
             llm_config = await load_llm_config(db)
             llm_service = LLMService(config=llm_config)
@@ -293,12 +306,27 @@ async def async_parse_and_save(
                 if use_pdf:
                     # Use Gemini's native PDF processing
                     print(f"[Exam {exam_id}] Using Gemini native PDF processing", flush=True)
                     print(f"[Exam {exam_id}] PDF file size: {len(file_content)} bytes", flush=True)
-                    questions_data = await llm_service.parse_document_with_pdf(file_content, filename)
+
+                    await progress_service.update_progress(ProgressUpdate(
+                        exam_id=exam_id,
+                        status=ProgressStatus.PARSING,
+                        message="使用Gemini解析PDF文档...",
+                        progress=10.0
+                    ))
+
+                    questions_data = await llm_service.parse_document_with_pdf(file_content, filename, exam_id)
                 else:
                     # Extract text first, then parse
                     if is_pdf:
                         print(f"[Exam {exam_id}] ⚠️ Warning: Using text extraction for PDF (provider does not support native PDF)", flush=True)
+
+                    await progress_service.update_progress(ProgressUpdate(
+                        exam_id=exam_id,
+                        status=ProgressStatus.PARSING,
+                        message="提取文档文本内容...",
+                        progress=10.0
+                    ))
+
                     print(f"[Exam {exam_id}] Extracting text from document...", flush=True)
                     text_content = await document_parser.parse_file(file_content, filename)
@@ -309,17 +337,40 @@ async def async_parse_and_save(
                     # Check if document is too long and needs splitting
                     if len(text_content) > 5000:
-                        print(f"[Exam {exam_id}] Document is long, splitting into chunks...", flush=True)
                         text_chunks = document_parser.split_text_with_overlap(text_content, chunk_size=3000, overlap=1000)
-                        print(f"[Exam {exam_id}] Split into {len(text_chunks)} chunks", flush=True)
+                        total_chunks = len(text_chunks)
+                        print(f"[Exam {exam_id}] Document is long, splitting into chunks...", flush=True)
+                        print(f"[Exam {exam_id}] Split into {total_chunks} chunks", flush=True)
+
+                        await progress_service.update_progress(ProgressUpdate(
+                            exam_id=exam_id,
+                            status=ProgressStatus.SPLITTING,
+                            message=f"文档已拆分为 {total_chunks} 个部分",
+                            progress=15.0,
+                            total_chunks=total_chunks
+                        ))
 
                         all_questions = []
                         for chunk_idx, chunk in enumerate(text_chunks):
-                            print(f"[Exam {exam_id}] Processing chunk {chunk_idx + 1}/{len(text_chunks)}...", flush=True)
+                            current_chunk = chunk_idx + 1
+                            chunk_progress = 15.0 + (60.0 * current_chunk / total_chunks)
+
+                            await progress_service.update_progress(ProgressUpdate(
+                                exam_id=exam_id,
+                                status=ProgressStatus.PROCESSING_CHUNK,
+                                message=f"正在处理第 {current_chunk}/{total_chunks} 部分...",
+                                progress=chunk_progress,
+                                total_chunks=total_chunks,
+                                current_chunk=current_chunk,
+                                questions_extracted=len(all_questions)
+                            ))
+
+                            print(f"[Exam {exam_id}] Processing chunk {current_chunk}/{total_chunks}...", flush=True)
 
                             try:
                                 chunk_questions = await llm_service.parse_document(chunk)
-                                print(f"[Exam {exam_id}] Chunk {chunk_idx + 1} extracted {len(chunk_questions)} questions", flush=True)
+                                print(f"[Exam {exam_id}] Chunk {current_chunk} extracted {len(chunk_questions)} questions", flush=True)
 
                                 # Fuzzy deduplicate across chunks
                                 for q in chunk_questions:
@@ -327,7 +378,7 @@ async def async_parse_and_save(
                                     if not is_duplicate_question(q, all_questions, threshold=0.85):
                                         all_questions.append(q)
                                     else:
-                                        print(f"[Exam {exam_id}] Skipped fuzzy duplicate from chunk {chunk_idx + 1}", flush=True)
+                                        print(f"[Exam {exam_id}] Skipped fuzzy duplicate from chunk {current_chunk}", flush=True)
                             except Exception as chunk_error:
                                 print(f"[Exam {exam_id}] Chunk {chunk_idx + 1} failed: {str(chunk_error)}", flush=True)
@@ -335,11 +386,37 @@ async def async_parse_and_save(
                         questions_data = all_questions
                         print(f"[Exam {exam_id}] Total questions after fuzzy deduplication: {len(questions_data)}", flush=True)
+
+                        await progress_service.update_progress(ProgressUpdate(
+                            exam_id=exam_id,
+                            status=ProgressStatus.DEDUPLICATING,
+                            message=f"所有部分处理完成,提取了 {len(questions_data)} 个题目",
+                            progress=75.0,
+                            total_chunks=total_chunks,
+                            current_chunk=total_chunks,
+                            questions_extracted=len(questions_data)
+                        ))
                     else:
                         print(f"[Exam {exam_id}] Document content preview:\n{text_content[:500]}\n{'...' if len(text_content) > 500 else ''}", flush=True)
                         print(f"[Exam {exam_id}] Calling LLM to extract questions...", flush=True)
+
+                        await progress_service.update_progress(ProgressUpdate(
+                            exam_id=exam_id,
+                            status=ProgressStatus.PARSING,
+                            message="正在提取题目...",
+                            progress=30.0
+                        ))
+
                         questions_data = await llm_service.parse_document(text_content)
+
+                        await progress_service.update_progress(ProgressUpdate(
+                            exam_id=exam_id,
+                            status=ProgressStatus.DEDUPLICATING,
+                            message=f"提取了 {len(questions_data)} 个题目",
+                            progress=60.0,
+                            questions_extracted=len(questions_data)
+                        ))
             except Exception as parse_error:
                 print(f"[Exam {exam_id}] ⚠️ Parse error details: {type(parse_error).__name__}", flush=True)
                 print(f"[Exam {exam_id}] ⚠️ Parse error message: {str(parse_error)}", flush=True)
@@ -351,6 +428,14 @@ async def async_parse_and_save(
                 raise Exception("No questions found in document")
 
             # Process questions with deduplication and AI answer generation
+            await progress_service.update_progress(ProgressUpdate(
+                exam_id=exam_id,
+                status=ProgressStatus.SAVING,
+                message="正在去重并保存题目到数据库...",
+                progress=80.0,
+                questions_extracted=len(questions_data)
+            ))
+
             print(f"[Exam {exam_id}] Processing questions with deduplication...")
             parse_result = await process_questions_with_dedup(exam_id, questions_data, db, llm_service)
@@ -370,9 +455,28 @@ async def async_parse_and_save(
print(f"[Exam {exam_id}] ✅ {parse_result.message}") print(f"[Exam {exam_id}] ✅ {parse_result.message}")
# Send completion progress
await progress_service.update_progress(ProgressUpdate(
exam_id=exam_id,
status=ProgressStatus.COMPLETED,
message=f"完成!添加了 {parse_result.new_added} 个题目(去重 {parse_result.duplicates_removed} 个)",
progress=100.0,
questions_extracted=parse_result.total_parsed,
questions_added=parse_result.new_added,
duplicates_removed=parse_result.duplicates_removed
))
except Exception as e: except Exception as e:
print(f"[Exam {exam_id}] ❌ Error: {str(e)}") print(f"[Exam {exam_id}] ❌ Error: {str(e)}")
# Send error progress
await progress_service.update_progress(ProgressUpdate(
exam_id=exam_id,
status=ProgressStatus.FAILED,
message=f"处理失败:{str(e)}",
progress=0.0
))
# Update exam status to failed # Update exam status to failed
result = await db.execute(select(Exam).where(Exam.id == exam_id)) result = await db.execute(select(Exam).where(Exam.id == exam_id))
exam = result.scalar_one() exam = result.scalar_one()
@@ -549,6 +653,70 @@ async def get_exam_detail(
     return exam
 
+@router.get("/{exam_id}/progress")
+async def get_exam_progress(
+    exam_id: int,
+    token: Optional[str] = None,
+    db: AsyncSession = Depends(get_db)
+):
+    """
+    Get real-time progress updates for exam document parsing (SSE endpoint).
+    Returns a Server-Sent Events stream with progress updates.
+    """
+    # Authenticate using token from query parameter (EventSource doesn't support custom headers)
+    from services.auth_service import get_current_user_from_token
+
+    if not token:
+        raise HTTPException(
+            status_code=status.HTTP_401_UNAUTHORIZED,
+            detail="Token required"
+        )
+
+    try:
+        current_user = await get_current_user_from_token(token, db)
+    except Exception as e:
+        raise HTTPException(
+            status_code=status.HTTP_401_UNAUTHORIZED,
+            detail="Invalid token"
+        )
+
+    # Verify exam belongs to user
+    result = await db.execute(
+        select(Exam).where(
+            and_(Exam.id == exam_id, Exam.user_id == current_user.id)
+        )
+    )
+    exam = result.scalar_one_or_none()
+
+    if not exam:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail="Exam not found"
+        )
+
+    async def event_generator():
+        """Generate SSE events"""
+        async for update in progress_service.subscribe(exam_id):
+            # Format as SSE
+            data = json.dumps(update.to_dict())
+            yield f"data: {data}\n\n"
+
+            # Stop if completed or failed
+            if update.status in ["completed", "failed"]:
+                break
+
+    return StreamingResponse(
+        event_generator(),
+        media_type="text/event-stream",
+        headers={
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "X-Accel-Buffering": "no"  # Disable nginx buffering
+        }
+    )
+
 @router.delete("/{exam_id}", status_code=status.HTTP_204_NO_CONTENT)
 async def delete_exam(
     exam_id: int,

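Review note: the new progress stream can be smoke-tested outside a browser. A minimal sketch, assuming httpx is installed, the server listens on localhost:8000 under the same /api prefix the frontend uses, and a valid JWT is pasted into TOKEN (all three are assumptions, not part of this commit):

# Hypothetical SSE smoke test for GET /api/exams/{id}/progress
import json
import httpx

TOKEN = "..."   # hypothetical: obtain via the normal login flow
EXAM_ID = 1     # hypothetical exam id

def follow_progress():
    url = f"http://localhost:8000/api/exams/{EXAM_ID}/progress"
    # SSE is plain HTTP; stream the response and read it line by line
    with httpx.stream("GET", url, params={"token": TOKEN}, timeout=None) as resp:
        for line in resp.iter_lines():
            if line.startswith("data: "):
                update = json.loads(line[len("data: "):])
                print(f"{update['progress']:5.1f}%  {update['status']:17s}  {update['message']}")
                if update["status"] in ("completed", "failed"):
                    break

if __name__ == "__main__":
    follow_progress()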
View File

@@ -82,3 +82,42 @@ async def get_optional_user(
         return await get_current_user(credentials, db)
     except HTTPException:
         return None
 
+async def get_current_user_from_token(token: str, db: AsyncSession) -> User:
+    """
+    Get current user from JWT token string (for SSE with query params).
+
+    Args:
+        token: JWT token string
+        db: Database session
+
+    Returns:
+        User object
+
+    Raises:
+        Exception: If token is invalid or user not found
+    """
+    # Decode token
+    payload = decode_access_token(token)
+    if payload is None:
+        raise Exception("Invalid token")
+
+    user_id = payload.get("sub")
+    if user_id is None:
+        raise Exception("Invalid token payload")
+
+    # Convert user_id to int if it's a string
+    try:
+        user_id = int(user_id)
+    except (ValueError, TypeError):
+        raise Exception("Invalid user ID")
+
+    # Get user from database
+    result = await db.execute(select(User).where(User.id == user_id))
+    user = result.scalar_one_or_none()
+    if user is None:
+        raise Exception("User not found")
+
+    return user

View File

@@ -453,7 +453,7 @@ class LLMService:
         return chunks
 
-    async def parse_document_with_pdf(self, pdf_bytes: bytes, filename: str) -> List[Dict[str, Any]]:
+    async def parse_document_with_pdf(self, pdf_bytes: bytes, filename: str, exam_id: int = None) -> List[Dict[str, Any]]:
         """
         Parse PDF document using Gemini's native PDF understanding.
         Automatically splits large PDFs into overlapping chunks.
@@ -462,6 +462,7 @@ class LLMService:
         Args:
             pdf_bytes: PDF file content as bytes
             filename: Original filename for logging
+            exam_id: Optional exam ID for progress updates
 
         Returns:
             List of question dictionaries
@@ -471,17 +472,44 @@ class LLMService:
         # Split PDF into chunks
         pdf_chunks = self.split_pdf_pages(pdf_bytes, pages_per_chunk=4, overlap=1)
-        print(f"[Gemini PDF] Processing {len(pdf_chunks)} chunk(s) for {filename}")
+        total_chunks = len(pdf_chunks)
+        print(f"[Gemini PDF] Processing {total_chunks} chunk(s) for {filename}")
+
+        # Send progress update if exam_id provided
+        if exam_id:
+            from services.progress_service import progress_service, ProgressUpdate, ProgressStatus
+            await progress_service.update_progress(ProgressUpdate(
+                exam_id=exam_id,
+                status=ProgressStatus.SPLITTING,
+                message=f"PDF已拆分为 {total_chunks} 个部分",
+                progress=15.0,
+                total_chunks=total_chunks
+            ))
 
         all_questions = []
 
         # Process each chunk with fuzzy deduplication
         for chunk_idx, chunk_bytes in enumerate(pdf_chunks):
-            print(f"[Gemini PDF] Processing chunk {chunk_idx + 1}/{len(pdf_chunks)}")
+            current_chunk = chunk_idx + 1
+            chunk_progress = 15.0 + (60.0 * current_chunk / total_chunks)
+            print(f"[Gemini PDF] Processing chunk {current_chunk}/{total_chunks}")
+
+            # Send progress update
+            if exam_id:
+                await progress_service.update_progress(ProgressUpdate(
+                    exam_id=exam_id,
+                    status=ProgressStatus.PROCESSING_CHUNK,
+                    message=f"正在处理第 {current_chunk}/{total_chunks} 部分...",
+                    progress=chunk_progress,
+                    total_chunks=total_chunks,
+                    current_chunk=current_chunk,
+                    questions_extracted=len(all_questions)
+                ))
+
             try:
-                questions = await self._parse_pdf_chunk(chunk_bytes, f"{filename}_chunk_{chunk_idx + 1}")
-                print(f"[Gemini PDF] Chunk {chunk_idx + 1} extracted {len(questions)} questions")
+                questions = await self._parse_pdf_chunk(chunk_bytes, f"{filename}_chunk_{current_chunk}")
+                print(f"[Gemini PDF] Chunk {current_chunk} extracted {len(questions)} questions")
 
                 # Fuzzy deduplicate across chunks
                 from dedup_utils import is_duplicate_question
@@ -490,15 +518,27 @@ class LLMService:
                     if not is_duplicate_question(q, all_questions, threshold=0.85):
                         all_questions.append(q)
                     else:
-                        print(f"[PDF Split] Skipped fuzzy duplicate from chunk {chunk_idx + 1}")
+                        print(f"[PDF Split] Skipped fuzzy duplicate from chunk {current_chunk}")
             except Exception as e:
-                print(f"[Gemini PDF] Chunk {chunk_idx + 1} failed: {str(e)}")
+                print(f"[Gemini PDF] Chunk {current_chunk} failed: {str(e)}")
                 # Continue with other chunks
                 continue
 
         print(f"[Gemini PDF] Total questions extracted: {len(all_questions)} (after deduplication)")
+
+        # Send final progress for PDF processing
+        if exam_id:
+            await progress_service.update_progress(ProgressUpdate(
+                exam_id=exam_id,
+                status=ProgressStatus.DEDUPLICATING,
+                message=f"PDF处理完成,提取了 {len(all_questions)} 个题目",
+                progress=75.0,
+                total_chunks=total_chunks,
+                current_chunk=total_chunks,
+                questions_extracted=len(all_questions)
+            ))
+
         return all_questions
 
     async def _parse_pdf_chunk(self, pdf_bytes: bytes, chunk_name: str) -> List[Dict[str, Any]]:

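Review note: split_pdf_pages is called here with pages_per_chunk=4 and overlap=1, but its body is not part of this diff. For orientation, a hypothetical sketch of page-window splitting with a one-page overlap, assuming pypdf (the real implementation may differ):

# Hypothetical sketch of page-based PDF chunking with overlap; the actual
# split_pdf_pages in this repo is not shown in this commit.
import io
from pypdf import PdfReader, PdfWriter

def split_pdf_pages(pdf_bytes: bytes, pages_per_chunk: int = 4, overlap: int = 1) -> list[bytes]:
    reader = PdfReader(io.BytesIO(pdf_bytes))
    # Advance the window by chunk size minus overlap, so adjacent chunks
    # share `overlap` pages (questions cut at a page boundary appear twice
    # and are later removed by fuzzy deduplication).
    step = max(1, pages_per_chunk - overlap)
    chunks = []
    for start in range(0, len(reader.pages), step):
        writer = PdfWriter()
        for page in reader.pages[start:start + pages_per_chunk]:
            writer.add_page(page)
        buf = io.BytesIO()
        writer.write(buf)
        chunks.append(buf.getvalue())
        if start + pages_per_chunk >= len(reader.pages):
            break  # last window already covered the tail
    return chunks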
View File

@@ -0,0 +1,149 @@
"""
Progress Service - Manages document parsing progress for real-time updates
"""
import asyncio
from typing import Dict, Optional, AsyncGenerator
from datetime import datetime
from enum import Enum
class ProgressStatus(str, Enum):
"""Progress status types"""
PENDING = "pending"
PARSING = "parsing"
SPLITTING = "splitting"
PROCESSING_CHUNK = "processing_chunk"
DEDUPLICATING = "deduplicating"
SAVING = "saving"
COMPLETED = "completed"
FAILED = "failed"
class ProgressUpdate:
"""Progress update data structure"""
def __init__(
self,
exam_id: int,
status: ProgressStatus,
message: str,
progress: float = 0.0,
total_chunks: int = 0,
current_chunk: int = 0,
questions_extracted: int = 0,
questions_added: int = 0,
duplicates_removed: int = 0
):
self.exam_id = exam_id
self.status = status
self.message = message
self.progress = progress # 0-100
self.total_chunks = total_chunks
self.current_chunk = current_chunk
self.questions_extracted = questions_extracted
self.questions_added = questions_added
self.duplicates_removed = duplicates_removed
self.timestamp = datetime.now().isoformat()
def to_dict(self) -> dict:
"""Convert to dictionary for JSON serialization"""
return {
"exam_id": self.exam_id,
"status": self.status.value,
"message": self.message,
"progress": round(self.progress, 1),
"total_chunks": self.total_chunks,
"current_chunk": self.current_chunk,
"questions_extracted": self.questions_extracted,
"questions_added": self.questions_added,
"duplicates_removed": self.duplicates_removed,
"timestamp": self.timestamp
}
class ProgressService:
"""Service for managing parsing progress"""
def __init__(self):
# Store progress updates for each exam
self._progress: Dict[int, ProgressUpdate] = {}
# Store queues for SSE connections
self._queues: Dict[int, list] = {}
async def update_progress(self, update: ProgressUpdate):
"""
Update progress for an exam and notify all listeners
Args:
update: Progress update object
"""
exam_id = update.exam_id
self._progress[exam_id] = update
# Send to all connected SSE clients for this exam
if exam_id in self._queues:
dead_queues = []
for queue in self._queues[exam_id]:
try:
await queue.put(update)
except Exception as e:
print(f"[Progress] Failed to send update to queue: {e}")
dead_queues.append(queue)
# Clean up dead queues
for dead_queue in dead_queues:
self._queues[exam_id].remove(dead_queue)
def get_progress(self, exam_id: int) -> Optional[ProgressUpdate]:
"""Get current progress for an exam"""
return self._progress.get(exam_id)
async def subscribe(self, exam_id: int) -> AsyncGenerator[ProgressUpdate, None]:
"""
Subscribe to progress updates for an exam (SSE stream)
Args:
exam_id: Exam ID to subscribe to
Yields:
Progress updates as they occur
"""
# Create a queue for this connection
queue = asyncio.Queue()
# Register the queue
if exam_id not in self._queues:
self._queues[exam_id] = []
self._queues[exam_id].append(queue)
try:
# Send current progress if exists
current_progress = self.get_progress(exam_id)
if current_progress:
yield current_progress
# Stream updates
while True:
update = await queue.get()
yield update
# Stop streaming if completed or failed
if update.status in [ProgressStatus.COMPLETED, ProgressStatus.FAILED]:
break
finally:
# Cleanup
if exam_id in self._queues and queue in self._queues[exam_id]:
self._queues[exam_id].remove(queue)
if not self._queues[exam_id]:
del self._queues[exam_id]
def clear_progress(self, exam_id: int):
"""Clear progress data for an exam"""
if exam_id in self._progress:
del self._progress[exam_id]
if exam_id in self._queues:
del self._queues[exam_id]
# Singleton instance
progress_service = ProgressService()

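Review note: a minimal sketch of how the producer/consumer pair is meant to interact, runnable from inside the backend package so the import resolves (exam id 42 and the timings are arbitrary):

# Hypothetical standalone exercise of ProgressService, outside FastAPI.
import asyncio

from services.progress_service import progress_service, ProgressUpdate, ProgressStatus

async def producer(exam_id: int):
    for pct, st in [(10.0, ProgressStatus.PARSING), (100.0, ProgressStatus.COMPLETED)]:
        await asyncio.sleep(0.1)  # give the consumer time to register its queue
        await progress_service.update_progress(ProgressUpdate(
            exam_id=exam_id, status=st, message=f"{pct:.0f}%", progress=pct
        ))

async def consumer(exam_id: int):
    # The generator returns on its own once it sees COMPLETED or FAILED
    async for update in progress_service.subscribe(exam_id):
        print(update.to_dict())

asyncio.run(asyncio.wait_for(
    asyncio.gather(consumer(42), producer(42)), timeout=5.0
))

Worth noting: _progress and _queues live in process memory, so this only works while the background task and the SSE subscriber share one process; with multiple uvicorn workers the producer and subscriber may land in different processes and the stream would stay silent.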
View File

@@ -0,0 +1,121 @@
/**
 * Parsing Progress Component
 * Displays real-time progress for document parsing
 */
import React from 'react'
import { Loader, CheckCircle, XCircle, FileText, Layers } from 'lucide-react'

export const ParsingProgress = ({ progress }) => {
  if (!progress) return null

  const { status, message, progress: percentage, total_chunks, current_chunk, questions_extracted, questions_added, duplicates_removed } = progress

  const getStatusIcon = () => {
    switch (status) {
      case 'completed':
        return <CheckCircle className="h-6 w-6 text-green-500" />
      case 'failed':
        return <XCircle className="h-6 w-6 text-red-500" />
      default:
        return <Loader className="h-6 w-6 text-primary-500 animate-spin" />
    }
  }

  const getStatusColor = () => {
    switch (status) {
      case 'completed':
        return 'bg-green-500'
      case 'failed':
        return 'bg-red-500'
      case 'processing_chunk':
        return 'bg-blue-500'
      default:
        return 'bg-primary-500'
    }
  }

  return (
    <div className="bg-white rounded-xl shadow-sm p-6 mb-6">
      <div className="flex items-start gap-4">
        <div className="flex-shrink-0">
          {getStatusIcon()}
        </div>

        <div className="flex-1">
          {/* Status Message */}
          <h3 className="text-lg font-semibold text-gray-900 mb-2">
            {status === 'completed' ? '解析完成' : status === 'failed' ? '解析失败' : '正在解析文档'}
          </h3>
          <p className="text-gray-600 mb-4">{message}</p>

          {/* Progress Bar */}
          {status !== 'completed' && status !== 'failed' && (
            <div className="mb-4">
              <div className="flex justify-between text-sm text-gray-600 mb-2">
                <span>进度</span>
                <span>{percentage.toFixed(0)}%</span>
              </div>
              <div className="w-full bg-gray-200 rounded-full h-3 overflow-hidden">
                <div
                  className={`h-3 ${getStatusColor()} transition-all duration-300 ease-out`}
                  style={{ width: `${percentage}%` }}
                ></div>
              </div>
            </div>
          )}

          {/* Details Grid */}
          <div className="grid grid-cols-2 md:grid-cols-4 gap-4 mt-4">
            {total_chunks > 0 && (
              <div className="bg-blue-50 rounded-lg p-3">
                <div className="flex items-center gap-2 mb-1">
                  <Layers className="h-4 w-4 text-blue-600" />
                  <span className="text-xs text-blue-600 font-medium">文档拆分</span>
                </div>
                <p className="text-lg font-bold text-blue-900">
                  {current_chunk}/{total_chunks}
                </p>
                <p className="text-xs text-blue-600">部分</p>
              </div>
            )}

            {questions_extracted > 0 && (
              <div className="bg-purple-50 rounded-lg p-3">
                <div className="flex items-center gap-2 mb-1">
                  <FileText className="h-4 w-4 text-purple-600" />
                  <span className="text-xs text-purple-600 font-medium">已提取</span>
                </div>
                <p className="text-lg font-bold text-purple-900">{questions_extracted}</p>
                <p className="text-xs text-purple-600">题目</p>
              </div>
            )}

            {questions_added > 0 && (
              <div className="bg-green-50 rounded-lg p-3">
                <div className="flex items-center gap-2 mb-1">
                  <CheckCircle className="h-4 w-4 text-green-600" />
                  <span className="text-xs text-green-600 font-medium">已添加</span>
                </div>
                <p className="text-lg font-bold text-green-900">{questions_added}</p>
                <p className="text-xs text-green-600">题目</p>
              </div>
            )}

            {duplicates_removed > 0 && (
              <div className="bg-orange-50 rounded-lg p-3">
                <div className="flex items-center gap-2 mb-1">
                  <XCircle className="h-4 w-4 text-orange-600" />
                  <span className="text-xs text-orange-600 font-medium">已去重</span>
                </div>
                <p className="text-lg font-bold text-orange-900">{duplicates_removed}</p>
                <p className="text-xs text-orange-600">题目</p>
              </div>
            )}
          </div>
        </div>
      </div>
    </div>
  )
}

export default ParsingProgress

View File

@@ -1,10 +1,11 @@
 /**
- * Exam Detail Page - with append upload and status polling
+ * Exam Detail Page - with real-time parsing progress via SSE
  */
-import React, { useState, useEffect } from 'react'
+import React, { useState, useEffect, useRef } from 'react'
 import { useParams, useNavigate } from 'react-router-dom'
 import { examAPI, questionAPI } from '../api/client'
 import Layout from '../components/Layout'
+import ParsingProgress from '../components/ParsingProgress'
 import {
   ArrowLeft, Upload, Play, Loader, FileText, AlertCircle, RefreshCw
 } from 'lucide-react'
@@ -28,16 +29,20 @@ export const ExamDetail = () => {
   const [uploading, setUploading] = useState(false)
   const [showUploadModal, setShowUploadModal] = useState(false)
   const [uploadFile, setUploadFile] = useState(null)
+  const [progress, setProgress] = useState(null)
+  const eventSourceRef = useRef(null)
 
   useEffect(() => {
     loadExamDetail()
 
-    // Start polling if status is processing
-    const interval = setInterval(() => {
-      pollExamStatus()
-    }, 3000)
-
-    return () => clearInterval(interval)
+    // Cleanup on unmount
+    return () => {
+      if (eventSourceRef.current) {
+        eventSourceRef.current.close()
+        eventSourceRef.current = null
+      }
+    }
   }, [examId])
 
   const loadExamDetail = async () => {
@@ -49,6 +54,11 @@ export const ExamDetail = () => {
       setExam(examRes.data)
       setQuestions(questionsRes.data.questions)
 
+      // Connect to SSE if exam is processing
+      if (examRes.data.status === 'processing') {
+        connectSSE()
+      }
     } catch (error) {
       console.error('Failed to load exam:', error)
       toast.error('加载题库失败')
@@ -57,22 +67,53 @@ export const ExamDetail = () => {
     }
   }
-  const pollExamStatus = async () => {
-    try {
-      const response = await examAPI.getDetail(examId)
-      const newExam = response.data
-
-      // If status changed from processing to ready
-      if (exam?.status === 'processing' && newExam.status === 'ready') {
-        toast.success('文档解析完成!')
-        await loadExamDetail() // Reload to get updated questions
-      } else if (exam?.status === 'processing' && newExam.status === 'failed') {
-        toast.error('文档解析失败')
-      }
-
-      setExam(newExam)
-    } catch (error) {
-      console.error('Failed to poll exam:', error)
-    }
-  }
+  const connectSSE = () => {
+    // Close existing connection if any
+    if (eventSourceRef.current) {
+      eventSourceRef.current.close()
+    }
+
+    console.log('[SSE] Connecting to progress stream for exam', examId)
+    const token = localStorage.getItem('token')
+    const url = `/api/exams/${examId}/progress?token=${encodeURIComponent(token)}`
+
+    const eventSource = new EventSource(url)
+    eventSourceRef.current = eventSource
+
+    eventSource.onmessage = (event) => {
+      try {
+        const progressData = JSON.parse(event.data)
+        console.log('[SSE] Progress update:', progressData)
+        setProgress(progressData)
+
+        // Update exam status if completed or failed
+        if (progressData.status === 'completed') {
+          toast.success(progressData.message)
+          setExam(prev => ({ ...prev, status: 'ready' }))
+          loadExamDetail() // Reload to get updated questions
+          eventSource.close()
+          eventSourceRef.current = null
+        } else if (progressData.status === 'failed') {
+          toast.error(progressData.message)
+          setExam(prev => ({ ...prev, status: 'failed' }))
+          eventSource.close()
+          eventSourceRef.current = null
+        }
+      } catch (error) {
+        console.error('[SSE] Failed to parse progress data:', error)
+      }
+    }
+
+    eventSource.onerror = (error) => {
+      console.error('[SSE] Connection error:', error)
+      eventSource.close()
+      eventSourceRef.current = null
+    }
+
+    eventSource.onopen = () => {
+      console.log('[SSE] Connection established')
+    }
+  }
@@ -96,9 +137,13 @@ export const ExamDetail = () => {
       toast.success('文档上传成功,正在解析并去重...')
       setShowUploadModal(false)
       setUploadFile(null)
-      await loadExamDetail()
+      setExam(prev => ({ ...prev, status: 'processing' }))
+
+      // Connect to SSE for real-time progress
+      connectSSE()
     } catch (error) {
       console.error('Failed to append document:', error)
+      toast.error('文档上传失败')
     } finally {
       setUploading(false)
     }
@@ -138,7 +183,7 @@ export const ExamDetail = () => {
   const isProcessing = exam.status === 'processing'
   const isReady = exam.status === 'ready'
   const isFailed = exam.status === 'failed'
-  const progress = calculateProgress(exam.current_index, exam.total_questions)
+  const quizProgress = calculateProgress(exam.current_index, exam.total_questions)
 
   return (
     <Layout>
@@ -152,6 +197,11 @@ export const ExamDetail = () => {
           返回题库列表
         </button>
 
+        {/* Parsing Progress (only shown when processing) */}
+        {isProcessing && progress && (
+          <ParsingProgress progress={progress} />
+        )}
+
         {/* Header */}
         <div className="bg-white rounded-xl shadow-sm p-6 mb-6">
           <div className="flex flex-col md:flex-row md:items-start md:justify-between mb-4">
@@ -223,7 +273,7 @@ export const ExamDetail = () => {
<div className="w-full bg-gray-200 rounded-full h-3"> <div className="w-full bg-gray-200 rounded-full h-3">
<div <div
className="bg-primary-600 h-3 rounded-full transition-all" className="bg-primary-600 h-3 rounded-full transition-all"
style={{ width: `${progress}%` }} style={{ width: `${quizProgress}%` }}
></div> ></div>
</div> </div>
</div> </div>