Mirror of https://github.com/handsomezhuzhu/QQuiz.git, synced 2026-02-20 20:10:14 +00:00
Handle splitting of extra-long text segments (超长分段处理)
backend/dedup_utils.py | 142 lines (new file)
@@ -0,0 +1,142 @@
"""
Question Deduplication Utilities
Provides fuzzy matching algorithms to handle AI-generated variations
"""
import difflib
import re
from typing import List, Dict, Any


def normalize_text(text: str) -> str:
    """
    Normalize text for comparison by removing extra whitespace and punctuation variations.

    Args:
        text: Input text to normalize

    Returns:
        Normalized text
    """
    if not text:
        return ""
    # Convert to lowercase
    text = text.lower()
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    # Remove common punctuation variations (Chinese/English)
    text = text.replace('，', ',').replace('。', '.').replace('！', '!').replace('？', '?')
    text = text.replace('：', ':').replace('；', ';').replace('“', '"').replace('”', '"')
    # Strip leading/trailing whitespace
    return text.strip()


def calculate_similarity(text1: str, text2: str) -> float:
    """
    Calculate similarity between two texts using multiple methods.

    Uses a combination of:
    1. SequenceMatcher for character-level similarity (70% weight)
    2. Jaccard similarity for word-level matching (30% weight)

    Args:
        text1: First text
        text2: Second text

    Returns:
        Similarity score between 0.0 and 1.0
    """
    if not text1 or not text2:
        return 0.0

    # Normalize texts
    norm_text1 = normalize_text(text1)
    norm_text2 = normalize_text(text2)

    # Exact match after normalization
    if norm_text1 == norm_text2:
        return 1.0

    # 1. Character-level similarity using SequenceMatcher (handles typos, minor variations)
    char_similarity = difflib.SequenceMatcher(None, norm_text1, norm_text2).ratio()

    # 2. Word-level Jaccard similarity (handles word reordering, additions/deletions)
    words1 = set(norm_text1.split())
    words2 = set(norm_text2.split())

    if not words1 or not words2:
        return char_similarity

    intersection = words1.intersection(words2)
    union = words1.union(words2)
    jaccard_similarity = len(intersection) / len(union) if union else 0.0

    # Weighted average (character similarity matters more for exact question matching)
    final_similarity = 0.7 * char_similarity + 0.3 * jaccard_similarity

    return final_similarity


def is_duplicate_question(
    new_question: Dict[str, Any],
    existing_questions: List[Dict[str, Any]],
    threshold: float = 0.85
) -> bool:
    """
    Check if a question is duplicate using fuzzy matching.

    Handles AI-generated variations where the same question might have:
    - Minor wording differences
    - Extra/missing punctuation
    - Different whitespace
    - Slight paraphrasing

    Args:
        new_question: Question to check (dict with 'content' key)
        existing_questions: List of questions already processed
        threshold: Similarity threshold (0.85 = 85% similar is considered duplicate)

    Returns:
        True if duplicate found, False otherwise
    """
    new_content = new_question.get('content', '')
    if not new_content:
        return False

    for existing_q in existing_questions:
        existing_content = existing_q.get('content', '')
        if not existing_content:
            continue

        similarity = calculate_similarity(new_content, existing_content)

        if similarity >= threshold:
            print(f"[Fuzzy Dedup] Found duplicate (similarity: {similarity:.2%})", flush=True)
            print(f"  New: {new_content[:60]}...", flush=True)
            print(f"  Existing: {existing_content[:60]}...", flush=True)
            return True

    return False


def deduplicate_questions(
    questions: List[Dict[str, Any]],
    threshold: float = 0.85
) -> List[Dict[str, Any]]:
    """
    Remove duplicate questions from a list using fuzzy matching.

    Args:
        questions: List of questions to deduplicate
        threshold: Similarity threshold for fuzzy matching

    Returns:
        List of unique questions
    """
    unique_questions = []

    for q in questions:
        if not is_duplicate_question(q, unique_questions, threshold):
            unique_questions.append(q)

    print(f"[Dedup] Reduced from {len(questions)} to {len(unique_questions)} questions")
    return unique_questions
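For context, here is a minimal usage sketch of the new helpers. It is not part of the commit: the import assumes the backend/ directory is on the import path, and the sample questions, score estimate, and expected count are illustrative only.

# Usage sketch (illustrative; not part of backend/dedup_utils.py).
# Assumes backend/ is on sys.path so dedup_utils is importable.
from dedup_utils import calculate_similarity, deduplicate_questions

questions = [
    {"content": "What is the capital of France?"},
    {"content": "What is the capital city of France?"},  # slight paraphrase of the first
    {"content": "Which river flows through Paris?"},
]

# Blended score: 0.7 * SequenceMatcher ratio + 0.3 * word-level Jaccard (roughly 0.90 for this pair).
print(calculate_similarity(questions[0]["content"], questions[1]["content"]))

# With the default 0.85 threshold, the paraphrased variant is treated as a duplicate and dropped.
unique = deduplicate_questions(questions)
print(len(unique))  # expected: 2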