Mirror of https://github.com/handsomezhuzhu/QQuiz.git

Commit: Chunked handling of extra-long documents (超长分段处理)
backend/dedup_utils.py (new file, 142 lines)
@@ -0,0 +1,142 @@
"""
Question Deduplication Utilities
Provides fuzzy matching algorithms to handle AI-generated variations
"""
import difflib
import re
from typing import List, Dict, Any


def normalize_text(text: str) -> str:
    """
    Normalize text for comparison by collapsing extra whitespace and normalizing punctuation variations.

    Args:
        text: Input text to normalize

    Returns:
        Normalized text
    """
    if not text:
        return ""
    # Convert to lowercase
    text = text.lower()
    # Collapse extra whitespace
    text = re.sub(r'\s+', ' ', text)
    # Normalize common Chinese punctuation to its English equivalents
    text = text.replace(',', ',').replace('。', '.').replace('!', '!').replace('?', '?')
    text = text.replace(':', ':').replace(';', ';').replace('"', '"').replace('"', '"')
    # Strip leading/trailing whitespace
    return text.strip()


def calculate_similarity(text1: str, text2: str) -> float:
    """
    Calculate similarity between two texts using multiple methods.

    Uses a combination of:
    1. SequenceMatcher for character-level similarity (70% weight)
    2. Jaccard similarity for word-level matching (30% weight)

    Args:
        text1: First text
        text2: Second text

    Returns:
        Similarity score between 0.0 and 1.0
    """
    if not text1 or not text2:
        return 0.0

    # Normalize texts
    norm_text1 = normalize_text(text1)
    norm_text2 = normalize_text(text2)

    # Exact match after normalization
    if norm_text1 == norm_text2:
        return 1.0

    # 1. Character-level similarity using SequenceMatcher (handles typos, minor variations)
    char_similarity = difflib.SequenceMatcher(None, norm_text1, norm_text2).ratio()

    # 2. Word-level Jaccard similarity (handles word reordering, additions/deletions)
    words1 = set(norm_text1.split())
    words2 = set(norm_text2.split())

    if not words1 or not words2:
        return char_similarity

    intersection = words1.intersection(words2)
    union = words1.union(words2)
    jaccard_similarity = len(intersection) / len(union) if union else 0.0

    # Weighted average (character similarity matters more for exact question matching)
    final_similarity = 0.7 * char_similarity + 0.3 * jaccard_similarity

    return final_similarity


def is_duplicate_question(
    new_question: Dict[str, Any],
    existing_questions: List[Dict[str, Any]],
    threshold: float = 0.85
) -> bool:
    """
    Check if a question is a duplicate using fuzzy matching.

    Handles AI-generated variations where the same question might have:
    - Minor wording differences
    - Extra/missing punctuation
    - Different whitespace
    - Slight paraphrasing

    Args:
        new_question: Question to check (dict with 'content' key)
        existing_questions: List of questions already processed
        threshold: Similarity threshold (0.85 = 85% similar is considered duplicate)

    Returns:
        True if a duplicate is found, False otherwise
    """
    new_content = new_question.get('content', '')
    if not new_content:
        return False

    for existing_q in existing_questions:
        existing_content = existing_q.get('content', '')
        if not existing_content:
            continue

        similarity = calculate_similarity(new_content, existing_content)

        if similarity >= threshold:
            print(f"[Fuzzy Dedup] Found duplicate (similarity: {similarity:.2%})", flush=True)
            print(f"  New: {new_content[:60]}...", flush=True)
            print(f"  Existing: {existing_content[:60]}...", flush=True)
            return True

    return False


def deduplicate_questions(
    questions: List[Dict[str, Any]],
    threshold: float = 0.85
) -> List[Dict[str, Any]]:
    """
    Remove duplicate questions from a list using fuzzy matching.

    Args:
        questions: List of questions to deduplicate
        threshold: Similarity threshold for fuzzy matching

    Returns:
        List of unique questions
    """
    unique_questions = []

    for q in questions:
        if not is_duplicate_question(q, unique_questions, threshold):
            unique_questions.append(q)

    print(f"[Dedup] Reduced from {len(questions)} to {len(unique_questions)} questions")
    return unique_questions
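
The snippet below is not part of the commit; it is a minimal usage sketch of the new module, assuming backend/ (where dedup_utils.py lives) is on the import path and using made-up question strings. It shows how the 0.7/0.3 weighting of character-level and word-level similarity plays out against the default 0.85 threshold.

# Illustrative usage of the new module (sample data, not repository code)
from dedup_utils import calculate_similarity, deduplicate_questions

q1 = {"content": "What is the capital of France?"}
q2 = {"content": "What is the capital of France"}   # same question, trailing punctuation lost
q3 = {"content": "Which planet is known as the Red Planet?"}

score = calculate_similarity(q1["content"], q2["content"])
print(f"similarity: {score:.2f}")  # ~0.90: char ratio ~0.98 (70%) + Jaccard 5/7 (30%)

unique = deduplicate_questions([q1, q2, q3], threshold=0.85)
print(len(unique))  # 2 -- q2 is dropped as a fuzzy duplicate of q1
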
@@ -20,6 +20,7 @@ from services.document_parser import document_parser
from services.llm_service import LLMService
from services.config_service import load_llm_config
from utils import is_allowed_file, calculate_content_hash
from dedup_utils import is_duplicate_question

router = APIRouter()
@@ -154,7 +155,11 @@ async def process_questions_with_dedup(
    llm_service=None
) -> ParseResult:
    """
    Process parsed questions with deduplication logic.
    Process parsed questions with fuzzy deduplication logic.

    Uses a two-stage deduplication strategy:
    1. Fast exact hash matching (for 100% identical questions)
    2. Fuzzy similarity matching (for AI-generated variations)

    Args:
        exam_id: Target exam ID
@@ -170,17 +175,28 @@ async def process_questions_with_dedup(
    new_added = 0
    ai_answers_generated = 0

    # Get existing content hashes for this exam
    # Get existing questions for this exam (content for fuzzy matching)
    result = await db.execute(
        select(Question.content_hash).where(Question.exam_id == exam_id)
        select(Question.content, Question.content_hash).where(Question.exam_id == exam_id)
    )
    existing_hashes = set(row[0] for row in result.all())
    existing_questions_db = result.all()
    existing_hashes = set(row[1] for row in existing_questions_db)
    existing_questions = [{"content": row[0]} for row in existing_questions_db]

    print(f"[Dedup] Checking against {len(existing_questions)} existing questions in database")

    # Insert only new questions
    for q_data in questions_data:
        content_hash = q_data.get("content_hash")

        # Stage 1: Fast exact hash matching
        if content_hash in existing_hashes:
            duplicates_removed += 1
            print(f"[Dedup] Exact hash match - skipping", flush=True)
            continue

        # Stage 2: Fuzzy similarity matching (only if hash didn't match)
        if is_duplicate_question(q_data, existing_questions, threshold=0.85):
            duplicates_removed += 1
            continue
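
For clarity, the two-stage check above can be read as the helper sketched below. This is not repository code: the real exact-match hash comes from utils.calculate_content_hash, which this diff does not show, so a SHA-256 over the question text stands in for it here; is_duplicate_question is the real function added in backend/dedup_utils.py.

# Sketch of the two-stage dedup decision in isolation (stand-in hash, sample data)
import hashlib
from dedup_utils import is_duplicate_question

def stand_in_hash(content: str) -> str:
    # Placeholder for utils.calculate_content_hash (implementation not shown in this diff)
    return hashlib.sha256(content.encode("utf-8")).hexdigest()

def is_new_question(q_data: dict, existing_hashes: set, existing_questions: list) -> bool:
    # Stage 1: exact hash lookup is cheap, so it runs first
    if q_data.get("content_hash") in existing_hashes:
        return False
    # Stage 2: fuzzy matching catches AI rewordings that a hash cannot
    return not is_duplicate_question(q_data, existing_questions, threshold=0.85)

existing_questions = [{"content": "What is the capital of France?"}]
existing_hashes = {stand_in_hash(q["content"]) for q in existing_questions}

candidate = {"content": "What is the capital of  France ?"}  # reworded spacing/punctuation
candidate["content_hash"] = stand_in_hash(candidate["content"])
print(is_new_question(candidate, existing_hashes, existing_questions))  # False: stage 2 catches it
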
@@ -222,7 +238,8 @@ async def process_questions_with_dedup(
            content_hash=content_hash
        )
        db.add(new_question)
        existing_hashes.add(content_hash)  # Add to set to prevent duplicates in current batch
        existing_hashes.add(content_hash)  # Prevent exact duplicates in current batch
        existing_questions.append({"content": q_data["content"]})  # Prevent fuzzy duplicates in current batch
        new_added += 1

    await db.commit()
@@ -289,10 +306,39 @@ async def async_parse_and_save(
            raise Exception("Document appears to be empty or too short")

        print(f"[Exam {exam_id}] Text content length: {len(text_content)} chars", flush=True)
        print(f"[Exam {exam_id}] Document content preview:\n{text_content[:500]}\n{'...' if len(text_content) > 500 else ''}", flush=True)
        print(f"[Exam {exam_id}] Calling LLM to extract questions...", flush=True)

        questions_data = await llm_service.parse_document(text_content)
        # Check if document is too long and needs splitting
        if len(text_content) > 5000:
            print(f"[Exam {exam_id}] Document is long, splitting into chunks...", flush=True)
            text_chunks = document_parser.split_text_with_overlap(text_content, chunk_size=3000, overlap=1000)
            print(f"[Exam {exam_id}] Split into {len(text_chunks)} chunks", flush=True)

            all_questions = []

            for chunk_idx, chunk in enumerate(text_chunks):
                print(f"[Exam {exam_id}] Processing chunk {chunk_idx + 1}/{len(text_chunks)}...", flush=True)
                try:
                    chunk_questions = await llm_service.parse_document(chunk)
                    print(f"[Exam {exam_id}] Chunk {chunk_idx + 1} extracted {len(chunk_questions)} questions", flush=True)

                    # Fuzzy deduplicate across chunks
                    for q in chunk_questions:
                        # Use fuzzy matching to check for duplicates
                        if not is_duplicate_question(q, all_questions, threshold=0.85):
                            all_questions.append(q)
                        else:
                            print(f"[Exam {exam_id}] Skipped fuzzy duplicate from chunk {chunk_idx + 1}", flush=True)

                except Exception as chunk_error:
                    print(f"[Exam {exam_id}] Chunk {chunk_idx + 1} failed: {str(chunk_error)}", flush=True)
                    continue

            questions_data = all_questions
            print(f"[Exam {exam_id}] Total questions after fuzzy deduplication: {len(questions_data)}", flush=True)
        else:
            print(f"[Exam {exam_id}] Document content preview:\n{text_content[:500]}\n{'...' if len(text_content) > 500 else ''}", flush=True)
            print(f"[Exam {exam_id}] Calling LLM to extract questions...", flush=True)
            questions_data = await llm_service.parse_document(text_content)

    except Exception as parse_error:
        print(f"[Exam {exam_id}] ⚠️ Parse error details: {type(parse_error).__name__}", flush=True)
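
The chunk loop above can be exercised offline with a stub in place of LLMService, as in the sketch below (not repository code; the stub, the chunk strings, and the sample questions are invented). A question that falls in the overlap region comes back from two adjacent chunks in slightly different wording, and only one copy survives.

# Offline sketch of cross-chunk fuzzy deduplication with a stubbed LLM service
import asyncio
from dedup_utils import is_duplicate_question  # module added in this commit

class StubLLMService:
    async def parse_document(self, chunk: str):
        if "CHUNK-A" in chunk:
            return [{"content": "1. What is the capital of France?"},
                    {"content": "2. Which planet is known as the Red Planet?"}]
        # Chunk B re-extracts the overlap question with slightly different spacing
        return [{"content": "1. What is the capital of France ?"},
                {"content": "3. Who wrote 'Hamlet'?"}]

async def main():
    llm_service = StubLLMService()
    all_questions = []
    for chunk in ["CHUNK-A ...", "CHUNK-B ..."]:
        for q in await llm_service.parse_document(chunk):
            if not is_duplicate_question(q, all_questions, threshold=0.85):
                all_questions.append(q)
    print(len(all_questions))  # 3 -- the overlap duplicate from chunk B was dropped

asyncio.run(main())
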
@@ -3,7 +3,7 @@ Document Parser Service
Supports: TXT, PDF, DOCX, XLSX
"""
import io
from typing import Optional
from typing import Optional, List
import PyPDF2
from docx import Document
import openpyxl
@@ -40,6 +40,38 @@ class DocumentParser:
        except Exception as e:
            raise Exception(f"Failed to parse PDF: {str(e)}")

    @staticmethod
    def split_text_with_overlap(text: str, chunk_size: int = 3000, overlap: int = 500) -> List[str]:
        """
        Split text into overlapping chunks for long documents.

        Args:
            text: Full text content
            chunk_size: Characters per chunk (default: 3000)
            overlap: Overlapping characters between chunks (default: 500)

        Returns:
            List of text chunks
        """
        if len(text) <= chunk_size:
            return [text]

        chunks = []
        start = 0

        while start < len(text):
            end = min(start + chunk_size, len(text))
            chunk = text[start:end]
            chunks.append(chunk)

            print(f"[Text Split] Chunk {len(chunks)}: chars {start}-{end}")

            # Move to next chunk with overlap
            start = end - overlap if end < len(text) else len(text)

        print(f"[Text Split] Total chunks: {len(chunks)}")
        return chunks

    @staticmethod
    async def parse_docx(file_content: bytes) -> str:
        """Parse DOCX file"""
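
As a sanity check of the overlap arithmetic (illustrative only, no repository imports): with the values the upload route passes (chunk_size=3000, overlap=1000), a hypothetical 7000-character document is cut into three chunks, and the 1000-character overlap means a question that straddles a boundary still appears whole in at least one chunk.

# Boundary arithmetic of split_text_with_overlap, reproduced standalone
def chunk_bounds(length: int, chunk_size: int = 3000, overlap: int = 1000):
    bounds, start = [], 0
    while start < length:
        end = min(start + chunk_size, length)
        bounds.append((start, end))
        start = end - overlap if end < length else length
    return bounds

print(chunk_bounds(7000))  # [(0, 3000), (2000, 5000), (4000, 7000)]
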
@@ -121,7 +121,8 @@ class LLMService:
**识别规则**:
- 文档中可能包含中文或英文题目
- 题目可能有多种格式,请灵活识别
- 即使格式不标准,也请尽量提取题目内容
- **重要**:只提取完整的题目,忽略任何不完整的题目(题目被截断、缺少选项、缺少关键信息等)
- 如果题目看起来不完整(比如开头或结尾被切断),直接跳过该题目
- 如果文档只是普通文章而没有题目,请返回空数组 []

**题目类型识别** (严格使用以下4种类型之一):
@@ -404,9 +405,58 @@
            print(f"[Error] Document parsing failed: {str(e)}")
            raise Exception(f"Failed to parse document: {str(e)}")

    def split_pdf_pages(self, pdf_bytes: bytes, pages_per_chunk: int = 4, overlap: int = 1) -> List[bytes]:
        """
        Split PDF into overlapping chunks to handle long documents.

        Args:
            pdf_bytes: PDF file content
            pages_per_chunk: Number of pages per chunk (default: 4)
            overlap: Number of overlapping pages between chunks (default: 1)

        Returns:
            List of PDF chunks as bytes
        """
        import PyPDF2
        import io

        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
        total_pages = len(pdf_reader.pages)

        # If PDF is small, don't split
        if total_pages <= pages_per_chunk:
            return [pdf_bytes]

        print(f"[PDF Split] Total pages: {total_pages}, splitting into chunks of {pages_per_chunk} pages with {overlap} page overlap")

        chunks = []
        start = 0

        while start < total_pages:
            end = min(start + pages_per_chunk, total_pages)

            # Create a new PDF with pages [start, end)
            pdf_writer = PyPDF2.PdfWriter()
            for page_num in range(start, end):
                pdf_writer.add_page(pdf_reader.pages[page_num])

            # Write to bytes
            chunk_bytes = io.BytesIO()
            pdf_writer.write(chunk_bytes)
            chunk_bytes.seek(0)
            chunks.append(chunk_bytes.getvalue())

            print(f"[PDF Split] Chunk {len(chunks)}: pages {start+1}-{end}")

            # Move to next chunk with overlap
            start = end - overlap if end < total_pages else total_pages

        return chunks

    async def parse_document_with_pdf(self, pdf_bytes: bytes, filename: str) -> List[Dict[str, Any]]:
        """
        Parse PDF document using Gemini's native PDF understanding.
        Automatically splits large PDFs into overlapping chunks.
        Only works with Gemini provider.

        Args:
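
The same overlap idea applied to pages: the sketch below reproduces only the index arithmetic of split_pdf_pages (no PyPDF2 required, not repository code) for a hypothetical 10-page PDF with the defaults pages_per_chunk=4 and overlap=1, matching the 1-based page numbers in the "[PDF Split] Chunk ..." log line.

# Page-range arithmetic of split_pdf_pages, reproduced standalone
def page_ranges(total_pages: int, pages_per_chunk: int = 4, overlap: int = 1):
    ranges, start = [], 0
    while start < total_pages:
        end = min(start + pages_per_chunk, total_pages)
        ranges.append((start + 1, end))  # 1-based, as printed by the PDF split log
        start = end - overlap if end < total_pages else total_pages
    return ranges

print(page_ranges(10))  # [(1, 4), (4, 7), (7, 10)]
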
@@ -419,12 +469,50 @@
        if self.provider != "gemini":
            raise ValueError("PDF parsing is only supported with Gemini provider")

        # Split PDF into chunks
        pdf_chunks = self.split_pdf_pages(pdf_bytes, pages_per_chunk=4, overlap=1)

        print(f"[Gemini PDF] Processing {len(pdf_chunks)} chunk(s) for {filename}")

        all_questions = []
        # Process each chunk with fuzzy deduplication
        for chunk_idx, chunk_bytes in enumerate(pdf_chunks):
            print(f"[Gemini PDF] Processing chunk {chunk_idx + 1}/{len(pdf_chunks)}")

            try:
                questions = await self._parse_pdf_chunk(chunk_bytes, f"{filename}_chunk_{chunk_idx + 1}")
                print(f"[Gemini PDF] Chunk {chunk_idx + 1} extracted {len(questions)} questions")

                # Fuzzy deduplicate across chunks
                from dedup_utils import is_duplicate_question

                for q in questions:
                    if not is_duplicate_question(q, all_questions, threshold=0.85):
                        all_questions.append(q)
                    else:
                        print(f"[PDF Split] Skipped fuzzy duplicate from chunk {chunk_idx + 1}")

            except Exception as e:
                print(f"[Gemini PDF] Chunk {chunk_idx + 1} failed: {str(e)}")
                # Continue with other chunks
                continue

        print(f"[Gemini PDF] Total questions extracted: {len(all_questions)} (after deduplication)")

        return all_questions

    async def _parse_pdf_chunk(self, pdf_bytes: bytes, chunk_name: str) -> List[Dict[str, Any]]:
        """
        Parse a single PDF chunk.
        Internal method used by parse_document_with_pdf.
        """
        prompt = """你是一个专业的试题解析专家。请仔细分析这个 PDF 文档,提取其中的所有试题。

**识别规则**:
- PDF 中可能包含中文或英文题目、图片、表格、公式
- 题目可能有多种格式,请灵活识别
- 即使格式不标准,也请尽量提取题目内容
- **重要**:只提取完整的题目,忽略任何不完整的题目(题目被截断、缺少选项、缺少关键信息等)
- 如果题目看起来不完整(比如开头或结尾被切断),直接跳过该题目
- 题目内容如果包含代码或换行,请将换行符替换为\\n
- 图片中的文字也要识别并提取
@@ -492,8 +580,8 @@
- **只返回一个 JSON 数组**,不要包含其他任何内容"""

        try:
            print(f"[Gemini PDF] Processing PDF: {filename}", flush=True)
            print(f"[Gemini PDF] File size: {len(pdf_bytes)} bytes", flush=True)
            print(f"[Gemini PDF] Processing chunk: {chunk_name}", flush=True)
            print(f"[Gemini PDF] Chunk size: {len(pdf_bytes)} bytes", flush=True)

            # Use Gemini's native PDF processing via REST API
            import base64