Split long documents into chunks; frontend progress feedback not yet working

2025-12-12 23:59:28 +08:00
parent a39f2d9e33
commit f5dd3bfc6c
7 changed files with 605 additions and 43 deletions

View File

@@ -82,3 +82,42 @@ async def get_optional_user(
         return await get_current_user(credentials, db)
     except HTTPException:
         return None
+
+
+async def get_current_user_from_token(token: str, db: AsyncSession) -> User:
+    """
+    Get current user from JWT token string (for SSE with query params).
+
+    Args:
+        token: JWT token string
+        db: Database session
+
+    Returns:
+        User object
+
+    Raises:
+        Exception: If token is invalid or user not found
+    """
+    # Decode token
+    payload = decode_access_token(token)
+    if payload is None:
+        raise Exception("Invalid token")
+
+    user_id = payload.get("sub")
+    if user_id is None:
+        raise Exception("Invalid token payload")
+
+    # Convert user_id to int if it's a string
+    try:
+        user_id = int(user_id)
+    except (ValueError, TypeError):
+        raise Exception("Invalid user ID")
+
+    # Get user from database
+    result = await db.execute(select(User).where(User.id == user_id))
+    user = result.scalar_one_or_none()
+
+    if user is None:
+        raise Exception("User not found")
+
+    return user
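This helper exists because the browser EventSource API cannot attach an Authorization header, so an SSE route has to receive the JWT in the URL and validate it by hand. A minimal sketch of that validation step, assuming FastAPI with a get_db session dependency and a route path that are not part of this commit:

# Sketch only -- route path and get_db are assumptions, not shown in this commit.
from fastapi import APIRouter, Depends, HTTPException, Query
from sqlalchemy.ext.asyncio import AsyncSession

router = APIRouter()

@router.get("/exams/{exam_id}/parse-progress")
async def parse_progress(
    exam_id: int,
    token: str = Query(...),             # JWT arrives in the URL, not a header
    db: AsyncSession = Depends(get_db),  # assumed session dependency
):
    try:
        user = await get_current_user_from_token(token, db)
    except Exception:
        # The helper raises bare Exception for every failure mode,
        # so any error maps to 401 here.
        raise HTTPException(status_code=401, detail="Invalid or expired token")
    # ... stream progress events for exam_id on behalf of user ...

One caveat of this pattern: tokens in query strings tend to end up in access logs, so short-lived tokens are advisable.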

View File

@@ -453,7 +453,7 @@ class LLMService:
 
         return chunks
 
-    async def parse_document_with_pdf(self, pdf_bytes: bytes, filename: str) -> List[Dict[str, Any]]:
+    async def parse_document_with_pdf(self, pdf_bytes: bytes, filename: str, exam_id: int = None) -> List[Dict[str, Any]]:
         """
         Parse PDF document using Gemini's native PDF understanding.
         Automatically splits large PDFs into overlapping chunks.
@@ -462,6 +462,7 @@ class LLMService:
         Args:
             pdf_bytes: PDF file content as bytes
             filename: Original filename for logging
+            exam_id: Optional exam ID for progress updates
 
         Returns:
             List of question dictionaries
@@ -471,17 +472,44 @@ class LLMService:
         # Split PDF into chunks
         pdf_chunks = self.split_pdf_pages(pdf_bytes, pages_per_chunk=4, overlap=1)
+        total_chunks = len(pdf_chunks)
 
-        print(f"[Gemini PDF] Processing {len(pdf_chunks)} chunk(s) for {filename}")
+        print(f"[Gemini PDF] Processing {total_chunks} chunk(s) for {filename}")
+
+        # Send progress update if exam_id provided
+        if exam_id:
+            from services.progress_service import progress_service, ProgressUpdate, ProgressStatus
+            await progress_service.update_progress(ProgressUpdate(
+                exam_id=exam_id,
+                status=ProgressStatus.SPLITTING,
+                message=f"PDF split into {total_chunks} part(s)",
+                progress=15.0,
+                total_chunks=total_chunks
+            ))
 
         all_questions = []
 
         # Process each chunk with fuzzy deduplication
         for chunk_idx, chunk_bytes in enumerate(pdf_chunks):
-            print(f"[Gemini PDF] Processing chunk {chunk_idx + 1}/{len(pdf_chunks)}")
+            current_chunk = chunk_idx + 1
+            # Progress climbs linearly through the 15-75% band as chunks complete
+            chunk_progress = 15.0 + (60.0 * current_chunk / total_chunks)
+
+            print(f"[Gemini PDF] Processing chunk {current_chunk}/{total_chunks}")
+
+            # Send progress update
+            if exam_id:
+                await progress_service.update_progress(ProgressUpdate(
+                    exam_id=exam_id,
+                    status=ProgressStatus.PROCESSING_CHUNK,
+                    message=f"Processing part {current_chunk}/{total_chunks}...",
+                    progress=chunk_progress,
+                    total_chunks=total_chunks,
+                    current_chunk=current_chunk,
+                    questions_extracted=len(all_questions)
+                ))
+
             try:
-                questions = await self._parse_pdf_chunk(chunk_bytes, f"{filename}_chunk_{chunk_idx + 1}")
-                print(f"[Gemini PDF] Chunk {chunk_idx + 1} extracted {len(questions)} questions")
+                questions = await self._parse_pdf_chunk(chunk_bytes, f"{filename}_chunk_{current_chunk}")
+                print(f"[Gemini PDF] Chunk {current_chunk} extracted {len(questions)} questions")
 
                 # Fuzzy deduplicate across chunks
                 from dedup_utils import is_duplicate_question
@@ -490,15 +518,27 @@ class LLMService:
                 if not is_duplicate_question(q, all_questions, threshold=0.85):
                     all_questions.append(q)
                 else:
-                    print(f"[PDF Split] Skipped fuzzy duplicate from chunk {chunk_idx + 1}")
+                    print(f"[PDF Split] Skipped fuzzy duplicate from chunk {current_chunk}")
 
             except Exception as e:
-                print(f"[Gemini PDF] Chunk {chunk_idx + 1} failed: {str(e)}")
+                print(f"[Gemini PDF] Chunk {current_chunk} failed: {str(e)}")
                 # Continue with other chunks
                 continue
 
         print(f"[Gemini PDF] Total questions extracted: {len(all_questions)} (after deduplication)")
 
+        # Send final progress for PDF processing
+        if exam_id:
+            await progress_service.update_progress(ProgressUpdate(
+                exam_id=exam_id,
+                status=ProgressStatus.DEDUPLICATING,
+                message=f"PDF processing complete, extracted {len(all_questions)} questions",
+                progress=75.0,
+                total_chunks=total_chunks,
+                current_chunk=total_chunks,
+                questions_extracted=len(all_questions)
+            ))
+
         return all_questions
 
     async def _parse_pdf_chunk(self, pdf_bytes: bytes, chunk_name: str) -> List[Dict[str, Any]]:
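The cross-chunk deduplication leans on is_duplicate_question from dedup_utils, whose implementation is not shown in this commit. Because split_pdf_pages is called with overlap=1, a question straddling a chunk boundary is extracted twice, and the two extractions are rarely byte-identical, which is why a fuzzy match with a 0.85 threshold is used instead of exact comparison. A plausible sketch of such a helper, assuming each question dict carries its text under a "question" key:

# Hypothetical sketch of dedup_utils.is_duplicate_question -- the real
# implementation is not part of this commit.
from difflib import SequenceMatcher
from typing import Any, Dict, List

def is_duplicate_question(
    candidate: Dict[str, Any],
    existing: List[Dict[str, Any]],
    threshold: float = 0.85,
) -> bool:
    """Return True if candidate's text fuzzily matches any already-kept question."""
    text = (candidate.get("question") or "").strip().lower()
    for kept in existing:
        kept_text = (kept.get("question") or "").strip().lower()
        # ratio() is 1.0 for identical strings, ~0.85+ for near-duplicates
        if SequenceMatcher(None, text, kept_text).ratio() >= threshold:
            return True
    return False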

View File

@@ -0,0 +1,149 @@
+"""
+Progress Service - Manages document parsing progress for real-time updates
+"""
+import asyncio
+from typing import Dict, Optional, AsyncGenerator
+from datetime import datetime
+from enum import Enum
+
+
+class ProgressStatus(str, Enum):
+    """Progress status types"""
+    PENDING = "pending"
+    PARSING = "parsing"
+    SPLITTING = "splitting"
+    PROCESSING_CHUNK = "processing_chunk"
+    DEDUPLICATING = "deduplicating"
+    SAVING = "saving"
+    COMPLETED = "completed"
+    FAILED = "failed"
+
+
+class ProgressUpdate:
+    """Progress update data structure"""
+
+    def __init__(
+        self,
+        exam_id: int,
+        status: ProgressStatus,
+        message: str,
+        progress: float = 0.0,
+        total_chunks: int = 0,
+        current_chunk: int = 0,
+        questions_extracted: int = 0,
+        questions_added: int = 0,
+        duplicates_removed: int = 0
+    ):
+        self.exam_id = exam_id
+        self.status = status
+        self.message = message
+        self.progress = progress  # 0-100
+        self.total_chunks = total_chunks
+        self.current_chunk = current_chunk
+        self.questions_extracted = questions_extracted
+        self.questions_added = questions_added
+        self.duplicates_removed = duplicates_removed
+        self.timestamp = datetime.now().isoformat()
+
+    def to_dict(self) -> dict:
+        """Convert to dictionary for JSON serialization"""
+        return {
+            "exam_id": self.exam_id,
+            "status": self.status.value,
+            "message": self.message,
+            "progress": round(self.progress, 1),
+            "total_chunks": self.total_chunks,
+            "current_chunk": self.current_chunk,
+            "questions_extracted": self.questions_extracted,
+            "questions_added": self.questions_added,
+            "duplicates_removed": self.duplicates_removed,
+            "timestamp": self.timestamp
+        }
+
+
+class ProgressService:
+    """Service for managing parsing progress"""
+
+    def __init__(self):
+        # Store progress updates for each exam
+        self._progress: Dict[int, ProgressUpdate] = {}
+        # Store queues for SSE connections
+        self._queues: Dict[int, list] = {}
+
+    async def update_progress(self, update: ProgressUpdate):
+        """
+        Update progress for an exam and notify all listeners
+
+        Args:
+            update: Progress update object
+        """
+        exam_id = update.exam_id
+        self._progress[exam_id] = update
+
+        # Send to all connected SSE clients for this exam
+        if exam_id in self._queues:
+            dead_queues = []
+            for queue in self._queues[exam_id]:
+                try:
+                    await queue.put(update)
+                except Exception as e:
+                    print(f"[Progress] Failed to send update to queue: {e}")
+                    dead_queues.append(queue)
+            # Clean up dead queues
+            for dead_queue in dead_queues:
+                self._queues[exam_id].remove(dead_queue)
+
+    def get_progress(self, exam_id: int) -> Optional[ProgressUpdate]:
+        """Get current progress for an exam"""
+        return self._progress.get(exam_id)
+
+    async def subscribe(self, exam_id: int) -> AsyncGenerator[ProgressUpdate, None]:
+        """
+        Subscribe to progress updates for an exam (SSE stream)
+
+        Args:
+            exam_id: Exam ID to subscribe to
+
+        Yields:
+            Progress updates as they occur
+        """
+        # Create a queue for this connection
+        queue = asyncio.Queue()
+
+        # Register the queue
+        if exam_id not in self._queues:
+            self._queues[exam_id] = []
+        self._queues[exam_id].append(queue)
+
+        try:
+            # Send current progress if exists
+            current_progress = self.get_progress(exam_id)
+            if current_progress:
+                yield current_progress
+
+            # Stream updates
+            while True:
+                update = await queue.get()
+                yield update
+
+                # Stop streaming if completed or failed
+                if update.status in [ProgressStatus.COMPLETED, ProgressStatus.FAILED]:
+                    break
+        finally:
+            # Cleanup
+            if exam_id in self._queues and queue in self._queues[exam_id]:
+                self._queues[exam_id].remove(queue)
+                if not self._queues[exam_id]:
+                    del self._queues[exam_id]
+
+    def clear_progress(self, exam_id: int):
+        """Clear progress data for an exam"""
+        if exam_id in self._progress:
+            del self._progress[exam_id]
+        if exam_id in self._queues:
+            del self._queues[exam_id]
+
+
+# Singleton instance
+progress_service = ProgressService()
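The commit message notes that the frontend feedback is not working yet; the missing piece is presumably the SSE route that bridges subscribe() to the browser. One plausible wiring, assuming FastAPI's StreamingResponse (the route itself is not part of this commit):

# Hypothetical SSE wiring for progress_service.subscribe(); assumed, not
# shown in this commit.
import json
from fastapi.responses import StreamingResponse

async def sse_progress(exam_id: int) -> StreamingResponse:
    async def event_stream():
        async for update in progress_service.subscribe(exam_id):
            # Each Server-Sent Event frame is "data: <payload>\n\n"
            yield f"data: {json.dumps(update.to_dict())}\n\n"

    return StreamingResponse(
        event_stream(),
        media_type="text/event-stream",
        headers={"Cache-Control": "no-cache"},  # discourage proxy buffering
    )

Note that ProgressService keeps everything in process memory: update_progress and subscribe must share one event loop, so this design works with a single worker but would need something like Redis pub/sub to survive multiple Uvicorn workers.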