Split long documents into chunks; frontend progress feedback not working yet

2025-12-12 23:59:28 +08:00
parent a39f2d9e33
commit f5dd3bfc6c
7 changed files with 605 additions and 43 deletions


@@ -2,12 +2,14 @@
Exam Router - Handles exam creation, file upload, and deduplication
"""
from fastapi import APIRouter, Depends, HTTPException, status, UploadFile, File, Form, BackgroundTasks
from fastapi.responses import StreamingResponse
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select, func, and_
from typing import List, Optional
from datetime import datetime, timedelta
import os
import aiofiles
import json
from database import get_db
from models import User, Exam, Question, ExamStatus, SystemConfig
@@ -19,6 +21,7 @@ from services.auth_service import get_current_user
from services.document_parser import document_parser
from services.llm_service import LLMService
from services.config_service import load_llm_config
from services.progress_service import progress_service
from utils import is_allowed_file, calculate_content_hash
from dedup_utils import is_duplicate_question
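Note: services/progress_service is among the seven changed files, but its diff is not shown on this page. A minimal sketch of the shape this router appears to assume, an in-memory pub/sub keyed by exam_id; everything beyond the three imported names (progress_service, ProgressUpdate, ProgressStatus) is hypothetical:

import asyncio
from dataclasses import dataclass, asdict
from enum import Enum
from typing import AsyncIterator, Dict, List, Optional

class ProgressStatus(str, Enum):
    # str mixin so update.status compares equal to plain strings
    # like "completed" / "failed" in the SSE loop below
    PARSING = "parsing"
    SPLITTING = "splitting"
    PROCESSING_CHUNK = "processing_chunk"
    DEDUPLICATING = "deduplicating"
    SAVING = "saving"
    COMPLETED = "completed"
    FAILED = "failed"

@dataclass
class ProgressUpdate:
    exam_id: int
    status: ProgressStatus
    message: str
    progress: float
    total_chunks: Optional[int] = None
    current_chunk: Optional[int] = None
    questions_extracted: Optional[int] = None
    questions_added: Optional[int] = None
    duplicates_removed: Optional[int] = None

    def to_dict(self) -> dict:
        d = asdict(self)
        d["status"] = self.status.value
        return d

class ProgressService:
    """One asyncio.Queue per subscriber, grouped by exam_id."""
    def __init__(self) -> None:
        self._subscribers: Dict[int, List[asyncio.Queue]] = {}

    async def update_progress(self, update: ProgressUpdate) -> None:
        # Fan the update out to every open subscription for this exam
        for queue in self._subscribers.get(update.exam_id, []):
            await queue.put(update)

    async def subscribe(self, exam_id: int) -> AsyncIterator[ProgressUpdate]:
        queue: asyncio.Queue = asyncio.Queue()
        self._subscribers.setdefault(exam_id, []).append(queue)
        try:
            while True:
                yield await queue.get()
        finally:
            self._subscribers[exam_id].remove(queue)

progress_service = ProgressService()

A single-process, in-memory registry like this only works when the background task and the SSE endpoint share one server worker; running multiple workers would require a shared broker such as Redis pub/sub.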
@@ -264,9 +267,11 @@ async def async_parse_and_save(
):
"""
Background task to parse document and save questions with deduplication.
Sends real-time progress updates via SSE.
"""
from database import AsyncSessionLocal
from sqlalchemy import select
from services.progress_service import ProgressUpdate, ProgressStatus
async with AsyncSessionLocal() as db:
try:
@@ -276,6 +281,14 @@ async def async_parse_and_save(
exam.status = ExamStatus.PROCESSING
await db.commit()
# Send initial progress
await progress_service.update_progress(ProgressUpdate(
exam_id=exam_id,
status=ProgressStatus.PARSING,
message="开始解析文档...",
progress=5.0
))
# Load LLM configuration from database
llm_config = await load_llm_config(db)
llm_service = LLMService(config=llm_config)
@@ -293,12 +306,27 @@ async def async_parse_and_save(
# Use Gemini's native PDF processing
print(f"[Exam {exam_id}] Using Gemini native PDF processing", flush=True)
print(f"[Exam {exam_id}] PDF file size: {len(file_content)} bytes", flush=True)
await progress_service.update_progress(ProgressUpdate(
exam_id=exam_id,
status=ProgressStatus.PARSING,
message="使用Gemini解析PDF文档...",
progress=10.0
))
questions_data = await llm_service.parse_document_with_pdf(file_content, filename, exam_id)
else:
# Extract text first, then parse
if is_pdf:
print(f"[Exam {exam_id}] ⚠️ Warning: Using text extraction for PDF (provider does not support native PDF)", flush=True)
await progress_service.update_progress(ProgressUpdate(
exam_id=exam_id,
status=ProgressStatus.PARSING,
message="提取文档文本内容...",
progress=10.0
))
print(f"[Exam {exam_id}] Extracting text from document...", flush=True)
text_content = await document_parser.parse_file(file_content, filename)
@@ -309,17 +337,40 @@ async def async_parse_and_save(
# Check if document is too long and needs splitting
if len(text_content) > 5000:
print(f"[Exam {exam_id}] Document is long, splitting into chunks...", flush=True)
text_chunks = document_parser.split_text_with_overlap(text_content, chunk_size=3000, overlap=1000)
print(f"[Exam {exam_id}] Split into {len(text_chunks)} chunks", flush=True)
total_chunks = len(text_chunks)
print(f"[Exam {exam_id}] Document is long, splitting into chunks...", flush=True)
print(f"[Exam {exam_id}] Split into {total_chunks} chunks", flush=True)
await progress_service.update_progress(ProgressUpdate(
exam_id=exam_id,
status=ProgressStatus.SPLITTING,
message=f"文档已拆分为 {total_chunks} 个部分",
progress=15.0,
total_chunks=total_chunks
))
all_questions = []
for chunk_idx, chunk in enumerate(text_chunks):
print(f"[Exam {exam_id}] Processing chunk {chunk_idx + 1}/{len(text_chunks)}...", flush=True)
current_chunk = chunk_idx + 1
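# Map chunk completion onto the 15%-75% progress band,
# e.g. chunk 2 of 4 gives 15 + 60 * 2/4 = 45.0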
chunk_progress = 15.0 + (60.0 * current_chunk / total_chunks)
await progress_service.update_progress(ProgressUpdate(
exam_id=exam_id,
status=ProgressStatus.PROCESSING_CHUNK,
message=f"正在处理第 {current_chunk}/{total_chunks} 部分...",
progress=chunk_progress,
total_chunks=total_chunks,
current_chunk=current_chunk,
questions_extracted=len(all_questions)
))
print(f"[Exam {exam_id}] Processing chunk {current_chunk}/{total_chunks}...", flush=True)
try:
chunk_questions = await llm_service.parse_document(chunk)
print(f"[Exam {exam_id}] Chunk {chunk_idx + 1} extracted {len(chunk_questions)} questions", flush=True)
print(f"[Exam {exam_id}] Chunk {current_chunk} extracted {len(chunk_questions)} questions", flush=True)
# Fuzzy deduplicate across chunks
for q in chunk_questions:
@@ -327,7 +378,7 @@ async def async_parse_and_save(
if not is_duplicate_question(q, all_questions, threshold=0.85):
all_questions.append(q)
else:
print(f"[Exam {exam_id}] Skipped fuzzy duplicate from chunk {chunk_idx + 1}", flush=True)
print(f"[Exam {exam_id}] Skipped fuzzy duplicate from chunk {current_chunk}", flush=True)
except Exception as chunk_error:
print(f"[Exam {exam_id}] Chunk {chunk_idx + 1} failed: {str(chunk_error)}", flush=True)
@@ -335,11 +386,37 @@ async def async_parse_and_save(
questions_data = all_questions
print(f"[Exam {exam_id}] Total questions after fuzzy deduplication: {len(questions_data)}", flush=True)
await progress_service.update_progress(ProgressUpdate(
exam_id=exam_id,
status=ProgressStatus.DEDUPLICATING,
message=f"所有部分处理完成,提取了 {len(questions_data)} 个题目",
progress=75.0,
total_chunks=total_chunks,
current_chunk=total_chunks,
questions_extracted=len(questions_data)
))
else:
print(f"[Exam {exam_id}] Document content preview:\n{text_content[:500]}\n{'...' if len(text_content) > 500 else ''}", flush=True)
print(f"[Exam {exam_id}] Calling LLM to extract questions...", flush=True)
await progress_service.update_progress(ProgressUpdate(
exam_id=exam_id,
status=ProgressStatus.PARSING,
message="正在提取题目...",
progress=30.0
))
questions_data = await llm_service.parse_document(text_content)
await progress_service.update_progress(ProgressUpdate(
exam_id=exam_id,
status=ProgressStatus.DEDUPLICATING,
message=f"提取了 {len(questions_data)} 个题目",
progress=60.0,
questions_extracted=len(questions_data)
))
except Exception as parse_error:
print(f"[Exam {exam_id}] ⚠️ Parse error details: {type(parse_error).__name__}", flush=True)
print(f"[Exam {exam_id}] ⚠️ Parse error message: {str(parse_error)}", flush=True)
@@ -351,6 +428,14 @@ async def async_parse_and_save(
raise Exception("No questions found in document")
# Process questions with deduplication and AI answer generation
await progress_service.update_progress(ProgressUpdate(
exam_id=exam_id,
status=ProgressStatus.SAVING,
message="正在去重并保存题目到数据库...",
progress=80.0,
questions_extracted=len(questions_data)
))
print(f"[Exam {exam_id}] Processing questions with deduplication...")
parse_result = await process_questions_with_dedup(exam_id, questions_data, db, llm_service)
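is_duplicate_question from dedup_utils is likewise outside this diff. One plausible shape, assuming difflib-based fuzzy matching and that each parsed question dict stores its text under a "content" key (both are assumptions):

from difflib import SequenceMatcher

def is_duplicate_question(question: dict, existing: list[dict], threshold: float = 0.85) -> bool:
    # The "content" key is hypothetical; the real question dicts
    # may carry their text under a different field.
    text = question.get("content", "")
    for other in existing:
        if SequenceMatcher(None, text, other.get("content", "")).ratio() >= threshold:
            return True
    return False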
@@ -370,9 +455,28 @@ async def async_parse_and_save(
print(f"[Exam {exam_id}] ✅ {parse_result.message}")
# Send completion progress
await progress_service.update_progress(ProgressUpdate(
exam_id=exam_id,
status=ProgressStatus.COMPLETED,
message=f"完成!添加了 {parse_result.new_added} 个题目(去重 {parse_result.duplicates_removed} 个)",
progress=100.0,
questions_extracted=parse_result.total_parsed,
questions_added=parse_result.new_added,
duplicates_removed=parse_result.duplicates_removed
))
except Exception as e:
print(f"[Exam {exam_id}] ❌ Error: {str(e)}")
# Send error progress
await progress_service.update_progress(ProgressUpdate(
exam_id=exam_id,
status=ProgressStatus.FAILED,
message=f"处理失败:{str(e)}",
progress=0.0
))
# Update exam status to failed
result = await db.execute(select(Exam).where(Exam.id == exam_id))
exam = result.scalar_one()
@@ -549,6 +653,70 @@ async def get_exam_detail(
return exam
@router.get("/{exam_id}/progress")
async def get_exam_progress(
exam_id: int,
token: Optional[str] = None,
db: AsyncSession = Depends(get_db)
):
"""
Get real-time progress updates for exam document parsing (SSE endpoint)
Returns Server-Sent Events stream with progress updates
"""
# Authenticate using token from query parameter (EventSource doesn't support custom headers)
from services.auth_service import get_current_user_from_token
if not token:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Token required"
)
try:
current_user = await get_current_user_from_token(token, db)
except Exception as e:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Invalid token"
)
# Verify exam belongs to user
result = await db.execute(
select(Exam).where(
and_(Exam.id == exam_id, Exam.user_id == current_user.id)
)
)
exam = result.scalar_one_or_none()
if not exam:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Exam not found"
)
async def event_generator():
"""Generate SSE events"""
async for update in progress_service.subscribe(exam_id):
# Format as SSE
data = json.dumps(update.to_dict())
yield f"data: {data}\n\n"
# Stop if completed or failed
if update.status in ["completed", "failed"]:
break
return StreamingResponse(
event_generator(),
media_type="text/event-stream",
headers={
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"X-Accel-Buffering": "no" # Disable nginx buffering
}
)
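For manual testing, this stream can be consumed without a browser EventSource. A minimal httpx-based client sketch; the base URL and the /api/exams router prefix are assumptions, since the prefix is not visible in this diff:

import asyncio
import json
import httpx

async def watch_progress(exam_id: int, token: str) -> None:
    # The token goes in the query string because EventSource
    # (and this endpoint) cannot use an Authorization header.
    url = f"http://localhost:8000/api/exams/{exam_id}/progress"
    async with httpx.AsyncClient(timeout=None) as client:
        async with client.stream("GET", url, params={"token": token}) as response:
            async for line in response.aiter_lines():
                if not line.startswith("data: "):
                    continue  # skip the blank separator lines between events
                update = json.loads(line[len("data: "):])
                print(f"{update['progress']:5.1f}% {update['message']}")
                if update["status"] in ("completed", "failed"):
                    break

# asyncio.run(watch_progress(1, "<jwt>"))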
@router.delete("/{exam_id}", status_code=status.HTTP_204_NO_CONTENT)
async def delete_exam(
exam_id: int,