长文本拆分，前端反馈还未成功

2026-04-18 22:42:53 +00:00 · 2025-12-12 23:59:28 +08:00
parent a39f2d9e33
commit f5dd3bfc6c
7 changed files with 605 additions and 43 deletions
--- a/7
+++ b/7
@@ -21,15 +21,10 @@ FROM python:3.11-slim

 WORKDIR /app

-# 安装系统依赖
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    && rm -rf /var/lib/apt/lists/*
-
 # 复制后端依赖文件
 COPY backend/requirements.txt ./

-# 安装 Python 依赖
+# 安装 Python 依赖（使用预编译wheel包，无需gcc）
 RUN pip install --no-cache-dir -r requirements.txt

 # 复制后端代码
--- a/backend/routers/exam.py
+++ b/backend/routers/exam.py
@@ -2,12 +2,14 @@
 Exam Router - Handles exam creation, file upload, and deduplication
 """
 from fastapi import APIRouter, Depends, HTTPException, status, UploadFile, File, Form, BackgroundTasks
+from fastapi.responses import StreamingResponse
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy import select, func, and_
 from typing import List, Optional
 from datetime import datetime, timedelta
 import os
 import aiofiles
+import json

 from database import get_db
 from models import User, Exam, Question, ExamStatus, SystemConfig
@@ -19,6 +21,7 @@ from services.auth_service import get_current_user
 from services.document_parser import document_parser
 from services.llm_service import LLMService
 from services.config_service import load_llm_config
+from services.progress_service import progress_service
 from utils import is_allowed_file, calculate_content_hash
 from dedup_utils import is_duplicate_question

@@ -264,9 +267,11 @@ async def async_parse_and_save(
 ):
    """
    Background task to parse document and save questions with deduplication.
+    Sends real-time progress updates via SSE.
    """
    from database import AsyncSessionLocal
    from sqlalchemy import select
+    from services.progress_service import ProgressUpdate, ProgressStatus

    async with AsyncSessionLocal() as db:
        try:
@@ -276,6 +281,14 @@ async def async_parse_and_save(
            exam.status = ExamStatus.PROCESSING
            await db.commit()

+            # Send initial progress
+            await progress_service.update_progress(ProgressUpdate(
+                exam_id=exam_id,
+                status=ProgressStatus.PARSING,
+                message="开始解析文档...",
+                progress=5.0
+            ))
+
            # Load LLM configuration from database
            llm_config = await load_llm_config(db)
            llm_service = LLMService(config=llm_config)
@@ -293,12 +306,27 @@ async def async_parse_and_save(
                    # Use Gemini's native PDF processing
                    print(f"[Exam {exam_id}] Using Gemini native PDF processing", flush=True)
                    print(f"[Exam {exam_id}] PDF file size: {len(file_content)} bytes", flush=True)
-                    questions_data = await llm_service.parse_document_with_pdf(file_content, filename)
+
+                    await progress_service.update_progress(ProgressUpdate(
+                        exam_id=exam_id,
+                        status=ProgressStatus.PARSING,
+                        message="使用Gemini解析PDF文档...",
+                        progress=10.0
+                    ))
+
+                    questions_data = await llm_service.parse_document_with_pdf(file_content, filename, exam_id)
                else:
                    # Extract text first, then parse
                    if is_pdf:
                        print(f"[Exam {exam_id}] ⚠️ Warning: Using text extraction for PDF (provider does not support native PDF)", flush=True)

+                    await progress_service.update_progress(ProgressUpdate(
+                        exam_id=exam_id,
+                        status=ProgressStatus.PARSING,
+                        message="提取文档文本内容...",
+                        progress=10.0
+                    ))
+
                    print(f"[Exam {exam_id}] Extracting text from document...", flush=True)
                    text_content = await document_parser.parse_file(file_content, filename)

@@ -309,17 +337,40 @@ async def async_parse_and_save(

                    # Check if document is too long and needs splitting
                    if len(text_content) > 5000:
-                        print(f"[Exam {exam_id}] Document is long, splitting into chunks...", flush=True)
                        text_chunks = document_parser.split_text_with_overlap(text_content, chunk_size=3000, overlap=1000)
-                        print(f"[Exam {exam_id}] Split into {len(text_chunks)} chunks", flush=True)
+                        total_chunks = len(text_chunks)
+
+                        print(f"[Exam {exam_id}] Document is long, splitting into chunks...", flush=True)
+                        print(f"[Exam {exam_id}] Split into {total_chunks} chunks", flush=True)
+
+                        await progress_service.update_progress(ProgressUpdate(
+                            exam_id=exam_id,
+                            status=ProgressStatus.SPLITTING,
+                            message=f"文档已拆分为 {total_chunks} 个部分",
+                            progress=15.0,
+                            total_chunks=total_chunks
+                        ))

                        all_questions = []

                        for chunk_idx, chunk in enumerate(text_chunks):
-                            print(f"[Exam {exam_id}] Processing chunk {chunk_idx + 1}/{len(text_chunks)}...", flush=True)
+                            current_chunk = chunk_idx + 1
+                            chunk_progress = 15.0 + (60.0 * current_chunk / total_chunks)
+
+                            await progress_service.update_progress(ProgressUpdate(
+                                exam_id=exam_id,
+                                status=ProgressStatus.PROCESSING_CHUNK,
+                                message=f"正在处理第 {current_chunk}/{total_chunks} 部分...",
+                                progress=chunk_progress,
+                                total_chunks=total_chunks,
+                                current_chunk=current_chunk,
+                                questions_extracted=len(all_questions)
+                            ))
+
+                            print(f"[Exam {exam_id}] Processing chunk {current_chunk}/{total_chunks}...", flush=True)
                            try:
                                chunk_questions = await llm_service.parse_document(chunk)
-                                print(f"[Exam {exam_id}] Chunk {chunk_idx + 1} extracted {len(chunk_questions)} questions", flush=True)
+                                print(f"[Exam {exam_id}] Chunk {current_chunk} extracted {len(chunk_questions)} questions", flush=True)

                                # Fuzzy deduplicate across chunks
                                for q in chunk_questions:
@@ -327,7 +378,7 @@ async def async_parse_and_save(
                                    if not is_duplicate_question(q, all_questions, threshold=0.85):
                                        all_questions.append(q)
                                    else:
-                                        print(f"[Exam {exam_id}] Skipped fuzzy duplicate from chunk {chunk_idx + 1}", flush=True)
+                                        print(f"[Exam {exam_id}] Skipped fuzzy duplicate from chunk {current_chunk}", flush=True)

                            except Exception as chunk_error:
                                print(f"[Exam {exam_id}] Chunk {chunk_idx + 1} failed: {str(chunk_error)}", flush=True)
@@ -335,11 +386,37 @@ async def async_parse_and_save(

                        questions_data = all_questions
                        print(f"[Exam {exam_id}] Total questions after fuzzy deduplication: {len(questions_data)}", flush=True)
+
+                        await progress_service.update_progress(ProgressUpdate(
+                            exam_id=exam_id,
+                            status=ProgressStatus.DEDUPLICATING,
+                            message=f"所有部分处理完成，提取了 {len(questions_data)} 个题目",
+                            progress=75.0,
+                            total_chunks=total_chunks,
+                            current_chunk=total_chunks,
+                            questions_extracted=len(questions_data)
+                        ))
                    else:
                        print(f"[Exam {exam_id}] Document content preview:\n{text_content[:500]}\n{'...' if len(text_content) > 500 else ''}", flush=True)
                        print(f"[Exam {exam_id}] Calling LLM to extract questions...", flush=True)
+
+                        await progress_service.update_progress(ProgressUpdate(
+                            exam_id=exam_id,
+                            status=ProgressStatus.PARSING,
+                            message="正在提取题目...",
+                            progress=30.0
+                        ))
+
                        questions_data = await llm_service.parse_document(text_content)

+                        await progress_service.update_progress(ProgressUpdate(
+                            exam_id=exam_id,
+                            status=ProgressStatus.DEDUPLICATING,
+                            message=f"提取了 {len(questions_data)} 个题目",
+                            progress=60.0,
+                            questions_extracted=len(questions_data)
+                        ))
+
            except Exception as parse_error:
                print(f"[Exam {exam_id}] ⚠️ Parse error details: {type(parse_error).__name__}", flush=True)
                print(f"[Exam {exam_id}] ⚠️ Parse error message: {str(parse_error)}", flush=True)
@@ -351,6 +428,14 @@ async def async_parse_and_save(
                raise Exception("No questions found in document")

            # Process questions with deduplication and AI answer generation
+            await progress_service.update_progress(ProgressUpdate(
+                exam_id=exam_id,
+                status=ProgressStatus.SAVING,
+                message="正在去重并保存题目到数据库...",
+                progress=80.0,
+                questions_extracted=len(questions_data)
+            ))
+
            print(f"[Exam {exam_id}] Processing questions with deduplication...")
            parse_result = await process_questions_with_dedup(exam_id, questions_data, db, llm_service)

@@ -370,9 +455,28 @@ async def async_parse_and_save(

            print(f"[Exam {exam_id}] ✅ {parse_result.message}")

+            # Send completion progress
+            await progress_service.update_progress(ProgressUpdate(
+                exam_id=exam_id,
+                status=ProgressStatus.COMPLETED,
+                message=f"完成！添加了 {parse_result.new_added} 个题目（去重 {parse_result.duplicates_removed} 个）",
+                progress=100.0,
+                questions_extracted=parse_result.total_parsed,
+                questions_added=parse_result.new_added,
+                duplicates_removed=parse_result.duplicates_removed
+            ))
+
        except Exception as e:
            print(f"[Exam {exam_id}] ❌ Error: {str(e)}")

+            # Send error progress
+            await progress_service.update_progress(ProgressUpdate(
+                exam_id=exam_id,
+                status=ProgressStatus.FAILED,
+                message=f"处理失败：{str(e)}",
+                progress=0.0
+            ))
+
            # Update exam status to failed
            result = await db.execute(select(Exam).where(Exam.id == exam_id))
            exam = result.scalar_one()
@@ -549,6 +653,70 @@ async def get_exam_detail(
    return exam


+@router.get("/{exam_id}/progress")
+async def get_exam_progress(
+    exam_id: int,
+    token: Optional[str] = None,
+    db: AsyncSession = Depends(get_db)
+):
+    """
+    Stream real-time document-parsing progress for an exam (SSE endpoint).
+
+    The JWT comes from the `token` query parameter because EventSource cannot
+    send custom headers. Raises HTTPException 401 for a missing or rejected
+    token and 404 when the exam is not found for the authenticated user.
+    """
+    from contextlib import aclosing
+    from services.auth_service import get_current_user_from_token
+
+    if not token:
+        raise HTTPException(
+            status_code=status.HTTP_401_UNAUTHORIZED,
+            detail="Token required"
+        )
+
+    try:
+        current_user = await get_current_user_from_token(token, db)
+    except Exception as exc:
+        # Keep the original failure as __cause__ for server-side debugging.
+        raise HTTPException(
+            status_code=status.HTTP_401_UNAUTHORIZED,
+            detail="Invalid token"
+        ) from exc
+
+    # Verify exam belongs to user
+    result = await db.execute(
+        select(Exam).where(
+            and_(Exam.id == exam_id, Exam.user_id == current_user.id)
+        )
+    )
+    if result.scalar_one_or_none() is None:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail="Exam not found"
+        )
+
+    async def event_generator():
+        """Yield each progress update as one SSE `data:` frame."""
+        # aclosing() runs subscribe()'s cleanup as soon as we stop iterating.
+        async with aclosing(progress_service.subscribe(exam_id)) as updates:
+            async for update in updates:
+                yield f"data: {json.dumps(update.to_dict())}\n\n"
+                # Stop after the terminal event so the stream ends cleanly.
+                if update.status in ["completed", "failed"]:
+                    break
+
+    return StreamingResponse(
+        event_generator(),
+        media_type="text/event-stream",
+        headers={
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "X-Accel-Buffering": "no"  # Disable nginx buffering
+        }
+    )
+
+
 @router.delete("/{exam_id}", status_code=status.HTTP_204_NO_CONTENT)
 async def delete_exam(
    exam_id: int,
--- a/backend/services/auth_service.py
+++ b/backend/services/auth_service.py
@@ -82,3 +82,42 @@ async def get_optional_user(
        return await get_current_user(credentials, db)
    except HTTPException:
        return None
+
+
+async def get_current_user_from_token(token: str, db: AsyncSession) -> User:
+    """
+    Resolve the user identified by a raw JWT string.
+
+    Intended for the SSE endpoint, which receives the token as a query
+    parameter instead of through request credentials.
+
+    Args:
+        token: Encoded JWT access token
+        db: Database session used for the user lookup
+
+    Returns:
+        The User row whose id matches the token's "sub" claim
+
+    Raises:
+        Exception: If decode_access_token returns None, the payload has no
+            "sub" claim, "sub" is not an integer, or no user has that id
+    """
+    claims = decode_access_token(token)
+    if claims is None:
+        raise Exception("Invalid token")
+
+    subject = claims.get("sub")
+    if subject is None:
+        raise Exception("Invalid token payload")
+
+    # The "sub" claim may arrive as a string; normalise it to an int id.
+    try:
+        subject_id = int(subject)
+    except (ValueError, TypeError):
+        raise Exception("Invalid user ID")
+
+    lookup = await db.execute(select(User).where(User.id == subject_id))
+    found = lookup.scalar_one_or_none()
+    if found is None:
+        raise Exception("User not found")
+    return found
--- a/backend/services/llm_service.py
+++ b/backend/services/llm_service.py
@@ -453,7 +453,7 @@ class LLMService:

        return chunks

-    async def parse_document_with_pdf(self, pdf_bytes: bytes, filename: str) -> List[Dict[str, Any]]:
+    async def parse_document_with_pdf(self, pdf_bytes: bytes, filename: str, exam_id: int = None) -> List[Dict[str, Any]]:
        """
        Parse PDF document using Gemini's native PDF understanding.
        Automatically splits large PDFs into overlapping chunks.
@@ -462,6 +462,7 @@ class LLMService:
        Args:
            pdf_bytes: PDF file content as bytes
            filename: Original filename for logging
+            exam_id: Optional exam ID for progress updates

        Returns:
            List of question dictionaries
@@ -471,17 +472,44 @@ class LLMService:

        # Split PDF into chunks
        pdf_chunks = self.split_pdf_pages(pdf_bytes, pages_per_chunk=4, overlap=1)
+        total_chunks = len(pdf_chunks)

-        print(f"[Gemini PDF] Processing {len(pdf_chunks)} chunk(s) for {filename}")
+        print(f"[Gemini PDF] Processing {total_chunks} chunk(s) for {filename}")
+
+        # Send progress update if exam_id provided
+        if exam_id:
+            from services.progress_service import progress_service, ProgressUpdate, ProgressStatus
+            await progress_service.update_progress(ProgressUpdate(
+                exam_id=exam_id,
+                status=ProgressStatus.SPLITTING,
+                message=f"PDF已拆分为 {total_chunks} 个部分",
+                progress=15.0,
+                total_chunks=total_chunks
+            ))

        all_questions = []
        # Process each chunk with fuzzy deduplication
        for chunk_idx, chunk_bytes in enumerate(pdf_chunks):
-            print(f"[Gemini PDF] Processing chunk {chunk_idx + 1}/{len(pdf_chunks)}")
+            current_chunk = chunk_idx + 1
+            chunk_progress = 15.0 + (60.0 * current_chunk / total_chunks)
+
+            print(f"[Gemini PDF] Processing chunk {current_chunk}/{total_chunks}")
+
+            # Send progress update
+            if exam_id:
+                await progress_service.update_progress(ProgressUpdate(
+                    exam_id=exam_id,
+                    status=ProgressStatus.PROCESSING_CHUNK,
+                    message=f"正在处理第 {current_chunk}/{total_chunks} 部分...",
+                    progress=chunk_progress,
+                    total_chunks=total_chunks,
+                    current_chunk=current_chunk,
+                    questions_extracted=len(all_questions)
+                ))

            try:
-                questions = await self._parse_pdf_chunk(chunk_bytes, f"{filename}_chunk_{chunk_idx + 1}")
-                print(f"[Gemini PDF] Chunk {chunk_idx + 1} extracted {len(questions)} questions")
+                questions = await self._parse_pdf_chunk(chunk_bytes, f"{filename}_chunk_{current_chunk}")
+                print(f"[Gemini PDF] Chunk {current_chunk} extracted {len(questions)} questions")

                # Fuzzy deduplicate across chunks
                from dedup_utils import is_duplicate_question
@@ -490,15 +518,27 @@ class LLMService:
                    if not is_duplicate_question(q, all_questions, threshold=0.85):
                        all_questions.append(q)
                    else:
-                        print(f"[PDF Split] Skipped fuzzy duplicate from chunk {chunk_idx + 1}")
+                        print(f"[PDF Split] Skipped fuzzy duplicate from chunk {current_chunk}")

            except Exception as e:
-                print(f"[Gemini PDF] Chunk {chunk_idx + 1} failed: {str(e)}")
+                print(f"[Gemini PDF] Chunk {current_chunk} failed: {str(e)}")
                # Continue with other chunks
                continue

        print(f"[Gemini PDF] Total questions extracted: {len(all_questions)} (after deduplication)")

+        # Send final progress for PDF processing
+        if exam_id:
+            await progress_service.update_progress(ProgressUpdate(
+                exam_id=exam_id,
+                status=ProgressStatus.DEDUPLICATING,
+                message=f"PDF处理完成，提取了 {len(all_questions)} 个题目",
+                progress=75.0,
+                total_chunks=total_chunks,
+                current_chunk=total_chunks,
+                questions_extracted=len(all_questions)
+            ))
+
        return all_questions

    async def _parse_pdf_chunk(self, pdf_bytes: bytes, chunk_name: str) -> List[Dict[str, Any]]:
--- a/backend/services/progress_service.py
+++ b/backend/services/progress_service.py
@@ -0,0 +1,149 @@
+"""
+Progress Service - Manages document parsing progress for real-time updates
+"""
+import asyncio
+from typing import Dict, Optional, AsyncGenerator
+from datetime import datetime
+from enum import Enum
+
+
+class ProgressStatus(str, Enum):
+    """Stages of a document-parsing run; the str mixin makes members equal to their string values"""
+    PENDING = "pending"  # NOTE(review): no visible caller sends this status - confirm intended use
+    PARSING = "parsing"  # parsing has started / text or questions are being extracted
+    SPLITTING = "splitting"  # the document has been split into chunks
+    PROCESSING_CHUNK = "processing_chunk"  # one chunk is being processed
+    DEDUPLICATING = "deduplicating"  # extraction finished; extracted-question count is known
+    SAVING = "saving"  # de-duplicating and saving questions to the database
+    COMPLETED = "completed"  # terminal: subscribe() stops streaming after this
+    FAILED = "failed"  # terminal: subscribe() stops streaming after this
+
+
+class ProgressUpdate:
+    """Snapshot of parsing progress for one exam, serialisable via to_dict()"""
+    def __init__(
+        self,
+        exam_id: int,
+        status: ProgressStatus,
+        message: str,
+        progress: float = 0.0,
+        total_chunks: int = 0,
+        current_chunk: int = 0,
+        questions_extracted: int = 0,
+        questions_added: int = 0,
+        duplicates_removed: int = 0
+    ):
+        self.exam_id = exam_id
+        self.status = ProgressStatus(status)  # accepts a member or its string; unknown -> ValueError
+        self.message = message
+        self.progress = min(100.0, max(0.0, float(progress)))  # clamp to 0-100
+        self.total_chunks = total_chunks
+        self.current_chunk = current_chunk
+        self.questions_extracted = questions_extracted
+        self.questions_added = questions_added
+        self.duplicates_removed = duplicates_removed
+        self.timestamp = datetime.now().isoformat()  # creation time, naive local clock
+
+    def to_dict(self) -> dict:
+        """Return a JSON-serialisable dict; status is emitted as its string value"""
+        return {
+            "exam_id": self.exam_id,
+            "status": self.status.value,
+            "message": self.message,
+            "progress": round(self.progress, 1),
+            "total_chunks": self.total_chunks,
+            "current_chunk": self.current_chunk,
+            "questions_extracted": self.questions_extracted,
+            "questions_added": self.questions_added,
+            "duplicates_removed": self.duplicates_removed,
+            "timestamp": self.timestamp
+        }
+
+
+class ProgressService:
+    """In-memory hub that stores the latest update per exam and fans updates out to SSE subscribers"""
+
+    # Statuses after which a run sends no further updates
+    _TERMINAL = (ProgressStatus.COMPLETED, ProgressStatus.FAILED)
+
+    def __init__(self):
+        # Latest update per exam, replayed to subscribers that connect late
+        self._progress: Dict[int, ProgressUpdate] = {}
+        # One asyncio.Queue per open subscription, grouped by exam ID
+        self._queues: Dict[int, list] = {}
+
+    async def update_progress(self, update: ProgressUpdate):
+        """
+        Record the latest progress for an exam and notify all listeners
+
+        Args:
+            update: Progress update object
+        """
+        self._progress[update.exam_id] = update
+        # The queues are unbounded, so put_nowait() cannot block or raise
+        # QueueFull. Iterate over a copy so the live list can change while
+        # updates are being delivered.
+        for queue in list(self._queues.get(update.exam_id, ())):
+            queue.put_nowait(update)
+
+    def get_progress(self, exam_id: int) -> Optional[ProgressUpdate]:
+        """Get the most recent update for an exam, or None if there is none"""
+        return self._progress.get(exam_id)
+
+    async def subscribe(self, exam_id: int) -> AsyncGenerator[ProgressUpdate, None]:
+        """
+        Subscribe to progress updates for an exam (SSE stream)
+
+        The latest known update, if any, is yielded first so late subscribers
+        see the current state. The stream ends after a COMPLETED or FAILED
+        update, including when that update is the replayed one.
+
+        NOTE(review): the stored update survives between runs, so a previous
+        run's terminal update can be replayed; callers should probably call
+        clear_progress() before starting a new run - confirm.
+
+        Args:
+            exam_id: Exam ID to subscribe to
+
+        Yields:
+            Progress updates as they occur
+        """
+        queue = asyncio.Queue()
+        # Register before replaying so updates published meanwhile are queued
+        self._queues.setdefault(exam_id, []).append(queue)
+
+        try:
+            latest = self.get_progress(exam_id)
+            if latest is not None:
+                yield latest
+                # Nothing more will arrive for a finished run; waiting on the
+                # queue here would leave this subscriber blocked forever.
+                if latest.status in self._TERMINAL:
+                    return
+
+            while True:
+                update = await queue.get()
+                yield update
+                if update.status in self._TERMINAL:
+                    break
+
+        finally:
+            # Deregister the queue; drop the exam entry once nobody listens
+            listeners = self._queues.get(exam_id)
+            if listeners is not None and queue in listeners:
+                listeners.remove(queue)
+                if not listeners:
+                    del self._queues[exam_id]
+
+    def clear_progress(self, exam_id: int):
+        """
+        Clear the stored update and the subscriber queues for an exam
+
+        Subscribers still waiting on a removed queue receive no further updates.
+        """
+        self._progress.pop(exam_id, None)
+        self._queues.pop(exam_id, None)
+
+
+# Module-level singleton: import this instance instead of constructing ProgressService again
+progress_service = ProgressService()
--- a/frontend/src/components/ParsingProgress.jsx
+++ b/frontend/src/components/ParsingProgress.jsx
@@ -0,0 +1,121 @@
+/**
+ * Parsing Progress Component
+ * Displays real-time progress for document parsing
+ */
+import React from 'react'
+import { Loader, CheckCircle, XCircle, FileText, Layers } from 'lucide-react'
+
+export const ParsingProgress = ({ progress }) => {  // progress: one parsed progress payload, or null/undefined
+  if (!progress) return null  // render nothing when no progress object is given
+
+  const { status, message, progress: percentage, total_chunks, current_chunk, questions_extracted, questions_added, duplicates_removed } = progress  // keys match the backend's ProgressUpdate.to_dict()
+
+  const getStatusIcon = () => {  // check / cross for the terminal statuses, spinner otherwise
+    switch (status) {
+      case 'completed':
+        return <CheckCircle className="h-6 w-6 text-green-500" />
+      case 'failed':
+        return <XCircle className="h-6 w-6 text-red-500" />
+      default:
+        return <Loader className="h-6 w-6 text-primary-500 animate-spin" />
+    }
+  }
+
+  const getStatusColor = () => {  // background class applied to the progress-bar fill
+    switch (status) {
+      case 'completed':
+        return 'bg-green-500'
+      case 'failed':
+        return 'bg-red-500'
+      case 'processing_chunk':
+        return 'bg-blue-500'
+      default:
+        return 'bg-primary-500'
+    }
+  }
+
+  return (
+    <div className="bg-white rounded-xl shadow-sm p-6 mb-6">
+      <div className="flex items-start gap-4">
+        <div className="flex-shrink-0">
+          {getStatusIcon()}
+        </div>
+
+        <div className="flex-1">
+          {/* Heading chosen by status, followed by the message from the payload */}
+          <h3 className="text-lg font-semibold text-gray-900 mb-2">
+            {status === 'completed' ? '解析完成' : status === 'failed' ? '解析失败' : '正在解析文档'}
+          </h3>
+          <p className="text-gray-600 mb-4">{message}</p>
+
+          {/* Progress bar: hidden once status is completed or failed */}
+          {status !== 'completed' && status !== 'failed' && (
+            <div className="mb-4">
+              <div className="flex justify-between text-sm text-gray-600 mb-2">
+                <span>进度</span>
+                <span>{percentage.toFixed(0)}%</span>
+              </div>
+              <div className="w-full bg-gray-200 rounded-full h-3 overflow-hidden">
+                <div
+                  className={`h-3 ${getStatusColor()} transition-all duration-300 ease-out`}
+                  style={{ width: `${percentage}%` }}
+                ></div>
+              </div>
+            </div>
+          )}
+
+          {/* Details grid: each tile is rendered only when its counter is greater than 0 */}
+          <div className="grid grid-cols-2 md:grid-cols-4 gap-4 mt-4">
+            {total_chunks > 0 && (
+              <div className="bg-blue-50 rounded-lg p-3">
+                <div className="flex items-center gap-2 mb-1">
+                  <Layers className="h-4 w-4 text-blue-600" />
+                  <span className="text-xs text-blue-600 font-medium">文档拆分</span>
+                </div>
+                <p className="text-lg font-bold text-blue-900">
+                  {current_chunk}/{total_chunks}
+                </p>
+                <p className="text-xs text-blue-600">部分</p>
+              </div>
+            )}
+
+            {questions_extracted > 0 && (
+              <div className="bg-purple-50 rounded-lg p-3">
+                <div className="flex items-center gap-2 mb-1">
+                  <FileText className="h-4 w-4 text-purple-600" />
+                  <span className="text-xs text-purple-600 font-medium">已提取</span>
+                </div>
+                <p className="text-lg font-bold text-purple-900">{questions_extracted}</p>
+                <p className="text-xs text-purple-600">题目</p>
+              </div>
+            )}
+
+            {questions_added > 0 && (
+              <div className="bg-green-50 rounded-lg p-3">
+                <div className="flex items-center gap-2 mb-1">
+                  <CheckCircle className="h-4 w-4 text-green-600" />
+                  <span className="text-xs text-green-600 font-medium">已添加</span>
+                </div>
+                <p className="text-lg font-bold text-green-900">{questions_added}</p>
+                <p className="text-xs text-green-600">题目</p>
+              </div>
+            )}
+
+            {duplicates_removed > 0 && (
+              <div className="bg-orange-50 rounded-lg p-3">
+                <div className="flex items-center gap-2 mb-1">
+                  <XCircle className="h-4 w-4 text-orange-600" />
+                  <span className="text-xs text-orange-600 font-medium">已去重</span>
+                </div>
+                <p className="text-lg font-bold text-orange-900">{duplicates_removed}</p>
+                <p className="text-xs text-orange-600">题目</p>
+              </div>
+            )}
+          </div>
+        </div>
+      </div>
+    </div>
+  )
+}
+
+export default ParsingProgress
--- a/frontend/src/pages/ExamDetail.jsx
+++ b/frontend/src/pages/ExamDetail.jsx
@@ -1,10 +1,11 @@
 /**
- * Exam Detail Page - with append upload and status polling
+ * Exam Detail Page - with real-time parsing progress via SSE
 */
-import React, { useState, useEffect } from 'react'
+import React, { useState, useEffect, useRef } from 'react'
 import { useParams, useNavigate } from 'react-router-dom'
 import { examAPI, questionAPI } from '../api/client'
 import Layout from '../components/Layout'
+import ParsingProgress from '../components/ParsingProgress'
 import {
  ArrowLeft, Upload, Play, Loader, FileText, AlertCircle, RefreshCw
 } from 'lucide-react'
@@ -28,16 +29,20 @@ export const ExamDetail = () => {
  const [uploading, setUploading] = useState(false)
  const [showUploadModal, setShowUploadModal] = useState(false)
  const [uploadFile, setUploadFile] = useState(null)
+  const [progress, setProgress] = useState(null)
+
+  const eventSourceRef = useRef(null)

  useEffect(() => {
    loadExamDetail()

-    // Start polling if status is processing
-    const interval = setInterval(() => {
-      pollExamStatus()
-    }, 3000)
-
-    return () => clearInterval(interval)
+    // Cleanup on unmount
+    return () => {
+      if (eventSourceRef.current) {
+        eventSourceRef.current.close()
+        eventSourceRef.current = null
+      }
+    }
  }, [examId])

  const loadExamDetail = async () => {
@@ -49,6 +54,11 @@ export const ExamDetail = () => {

      setExam(examRes.data)
      setQuestions(questionsRes.data.questions)
+
+      // Connect to SSE if exam is processing
+      if (examRes.data.status === 'processing') {
+        connectSSE()
+      }
    } catch (error) {
      console.error('Failed to load exam:', error)
      toast.error('加载题库失败')
@@ -57,22 +67,53 @@ export const ExamDetail = () => {
    }
  }

-  const pollExamStatus = async () => {
-    try {
-      const response = await examAPI.getDetail(examId)
-      const newExam = response.data
-
-      // If status changed from processing to ready
-      if (exam?.status === 'processing' && newExam.status === 'ready') {
-        toast.success('文档解析完成！')
-        await loadExamDetail() // Reload to get updated questions
-      } else if (exam?.status === 'processing' && newExam.status === 'failed') {
-        toast.error('文档解析失败')
+  const connectSSE = () => {
+    // Close existing connection if any
+    if (eventSourceRef.current) {
+      eventSourceRef.current.close()
    }

-      setExam(newExam)
+    console.log('[SSE] Connecting to progress stream for exam', examId)
+
+    const token = localStorage.getItem('token')
+    const url = `/api/exams/${examId}/progress?token=${encodeURIComponent(token)}`
+
+    const eventSource = new EventSource(url)
+    eventSourceRef.current = eventSource
+
+    eventSource.onmessage = (event) => {
+      try {
+        const progressData = JSON.parse(event.data)
+        console.log('[SSE] Progress update:', progressData)
+
+        setProgress(progressData)
+
+        // Update exam status if completed or failed
+        if (progressData.status === 'completed') {
+          toast.success(progressData.message)
+          setExam(prev => ({ ...prev, status: 'ready' }))
+          loadExamDetail() // Reload to get updated questions
+          eventSource.close()
+          eventSourceRef.current = null
+        } else if (progressData.status === 'failed') {
+          toast.error(progressData.message)
+          setExam(prev => ({ ...prev, status: 'failed' }))
+          eventSource.close()
+          eventSourceRef.current = null
+        }
      } catch (error) {
-      console.error('Failed to poll exam:', error)
+        console.error('[SSE] Failed to parse progress data:', error)
+      }
+    }
+
+    eventSource.onerror = (error) => {
+      console.error('[SSE] Connection error:', error)
+      eventSource.close()
+      eventSourceRef.current = null
+    }
+
+    eventSource.onopen = () => {
+      console.log('[SSE] Connection established')
    }
  }

@@ -96,9 +137,13 @@ export const ExamDetail = () => {
      toast.success('文档上传成功，正在解析并去重...')
      setShowUploadModal(false)
      setUploadFile(null)
-      await loadExamDetail()
+      setExam(prev => ({ ...prev, status: 'processing' }))
+
+      // Connect to SSE for real-time progress
+      connectSSE()
    } catch (error) {
      console.error('Failed to append document:', error)
+      toast.error('文档上传失败')
    } finally {
      setUploading(false)
    }
@@ -138,7 +183,7 @@ export const ExamDetail = () => {
  const isProcessing = exam.status === 'processing'
  const isReady = exam.status === 'ready'
  const isFailed = exam.status === 'failed'
-  const progress = calculateProgress(exam.current_index, exam.total_questions)
+  const quizProgress = calculateProgress(exam.current_index, exam.total_questions)

  return (
    <Layout>
@@ -152,6 +197,11 @@ export const ExamDetail = () => {
          返回题库列表
        </button>

+        {/* Parsing Progress (only shown when processing) */}
+        {isProcessing && progress && (
+          <ParsingProgress progress={progress} />
+        )}
+
        {/* Header */}
        <div className="bg-white rounded-xl shadow-sm p-6 mb-6">
          <div className="flex flex-col md:flex-row md:items-start md:justify-between mb-4">
@@ -223,7 +273,7 @@ export const ExamDetail = () => {
              <div className="w-full bg-gray-200 rounded-full h-3">
                <div
                  className="bg-primary-600 h-3 rounded-full transition-all"
-                  style={{ width: `${progress}%` }}
+                  style={{ width: `${quizProgress}%` }}
                ></div>
              </div>
            </div>